In [1]:
%run setup.ipynb
%matplotlib inline
# %load_ext autoreload
# %autoreload 1
# %aimport hapclust
import hapclust
import ag1k
In [2]:
# load data
callset_haps = np.load('../data/haps_phase1.npz')
haps = allel.HaplotypeArray(callset_haps['haplotypes'])
pos = allel.SortedIndex(callset_haps['POS'])
ann = callset_haps['ANN']
In [3]:
# chop into gene
loc_vgsc = pos.locate_range(region_vgsc.start, region_vgsc.end)
h_vgsc = haps[loc_vgsc]
pos_vgsc = pos[loc_vgsc]
h_vgsc
Out[3]:
In [4]:
# load haplotype metadata
df_haplotypes = ag1k.phase1_ar31.df_haplotypes.query('population != "colony"')
df_haplotypes.head()
Out[4]:
In [5]:
assert len(df_haplotypes) == h_vgsc.shape[1]
In [6]:
df_haplotypes.groupby(by=('country', 'population')).count()[['label']]
Out[6]:
In [7]:
hap_pops = df_haplotypes.population.values
# need to use named colors for graphviz
pop_colors = {
'AOM': 'brown',
'BFM': 'firebrick1',
'GWA': 'goldenrod1',
'GNS': 'cadetblue1',
'BFS': 'deepskyblue',
'CMS': 'dodgerblue3',
'UGS': 'palegreen',
'GAS': 'olivedrab',
'KES': 'grey47',
'colony': 'black'
}
hap_colors = np.array([pop_colors[p] for p in hap_pops])
In [8]:
# make an array of original haplotype indices, we'll need this later on to help
# figure out cluster concordances
hap_ixs = np.arange(len(df_haplotypes))
hap_ixs
Out[8]:
In [9]:
# obtain labels for non-synonymous variants
tbl_variant_labels = (
etl
.frompickle('../data/tbl_variants_phase1.pkl')
# .eq('num_alleles', 2)
.cut('POS', 'AGAP004707-RA')
.convert('AGAP004707-RA', lambda v: v[1] if v[0] == 'NON_SYNONYMOUS_CODING' else '')
.rename('AGAP004707-RA', 'label')
)
tbl_variant_labels
Out[9]:
In [10]:
len(tbl_variant_labels)
Out[10]:
In [11]:
pos2label = tbl_variant_labels.lookupone('POS', 'label')
pos2label[2358254]
Out[11]:
In [12]:
variant_labels = np.array([pos2label.get(p, '') for p in pos_vgsc], dtype=object)
variant_labels[:5]
Out[12]:
In [13]:
pos_vgsc
Out[13]:
In [14]:
# check network construction works - network all haplotypes
graph, _ = hapclust.graph_haplotype_network(
h_vgsc,
network_method='msn',
hap_colors=hap_colors,
max_dist= 2,
variant_labels=variant_labels,
show_singletons=False)
graph
Out[14]:
In [15]:
# locate haplotypes with resistance alleles
pos_995S = 2422651
pos_995F = 2422652
pos_1527T = 2429617
pos_490I = 2400071
loc_995S = h_vgsc[pos_vgsc.locate_key(pos_995S)] == 1
loc_995F = h_vgsc[pos_vgsc.locate_key(pos_995F)] == 1
loc_1527T = h_vgsc[pos_vgsc.locate_key(pos_1527T)] == 1
loc_490I = h_vgsc[pos_vgsc.locate_key(pos_490I)] == 1
In [16]:
pop_995F = df_haplotypes.population.iloc[loc_995F]
pop_995F.head()
Out[16]:
In [17]:
pop_995F.value_counts()
Out[17]:
In [18]:
pop_995S = df_haplotypes.population.iloc[loc_995S]
pop_995S.head()
Out[18]:
In [19]:
pop_995S.value_counts()
Out[19]:
In [20]:
pop_1527T = df_haplotypes.population.iloc[loc_1527T]
pop_1527T.value_counts()
Out[20]:
In [21]:
pop_490I = df_haplotypes.population.iloc[loc_490I]
pop_490I.value_counts()
Out[21]:
In [22]:
np.count_nonzero(loc_995F), np.count_nonzero(loc_995S), np.count_nonzero(loc_1527T), np.count_nonzero(loc_490I)
Out[22]:
In [23]:
# resistance haps
h_vgsc_995F = h_vgsc.compress(loc_995F, axis=1)
h_vgsc_995S = h_vgsc.compress(loc_995S, axis=1)
h_vgsc_1527T = h_vgsc.compress(loc_1527T, axis=1)
h_vgsc_490I = h_vgsc.compress(loc_490I, axis=1)
In [24]:
# colours
hap_colors_995F = hap_colors.compress(loc_995F)
hap_colors_995S = hap_colors.compress(loc_995S)
hap_colors_1527T = hap_colors.compress(loc_1527T)
hap_colors_490I = hap_colors.compress(loc_490I)
In [25]:
# original haplotype indices - will need these for cluster correspondence
hap_ixs_995F = hap_ixs.compress(loc_995F)
hap_ixs_995S = hap_ixs.compress(loc_995S)
hap_ixs_1527T = hap_ixs.compress(loc_1527T)
hap_ixs_490I = hap_ixs.compress(loc_490I)
In [26]:
graph_995F, h_distinct_sets_995F, components_995F = hapclust.graph_haplotype_network(
h_vgsc_995F,
network_method='mjn',
max_dist=2,
hap_colors=hap_colors_995F,
variant_labels=variant_labels,
return_components=True, # new option, N.B., changes return values
show_singletons=False, # new option, don't show unconnected singleton nodes, makes graph less noisy
)
graph_995F.format = 'svg'
fn = '../artwork/995F_clusters_mjn_maxdist2'
graph_995F.render(fn)
graph_995F
Out[26]:
In [27]:
# this tells you, for each *distinct* haplotype (i.e., node in the graph),
# which "connected component" (i.e., cluster) it belongs to
components_995F
Out[27]:
In [28]:
# this tells you the number of distinct haplotypes for each connected component
# e.g., component 0 has 51 *distinct* haplotypes (= 51 nodes in the graph)
# e.g., component 8 has 18 *distinct* haplotypes (= 18 nodes in the graph)
np.bincount(components_995F)
Out[28]:
In [29]:
def identify_components(h_distinct_sets, components):
"""This function is designed to collect all indices for original haplotypes
within each connected component. I.e., it finds clusters from the network."""
clusters = []
for c in np.unique(components):
cluster = set()
for i in np.nonzero(components == c)[0]:
cluster |= h_distinct_sets[i]
clusters.append(sorted(cluster))
return clusters
In [30]:
network_clusters_995F = identify_components(h_distinct_sets_995F, components_995F)
len(network_clusters_995F)
Out[30]:
In [31]:
len(network_clusters_995F[0])
Out[31]:
In [32]:
# load up hierarchical clustering results from previous analysis
hierarchical_cluster_membership = np.load('../data/hierarchical_cluster_membership.npy').astype('U')
hierarchical_clusters = {('other' if not k else k): set(np.nonzero(hierarchical_cluster_membership == k)[0])
for k in np.unique(hierarchical_cluster_membership)}
for k in sorted(hierarchical_clusters):
print(repr(k), len(hierarchical_clusters[k]))
In [33]:
len(hierarchical_clusters['F1'])
Out[33]:
In [34]:
# setup to quantify overall concordance
network_cluster_membership = np.full_like(hierarchical_cluster_membership, fill_value='')
network_cluster_membership[loc_995F] = 'FX'
network_cluster_membership[loc_995S] = 'SX'
In [35]:
def find_concordance_995F(nc_ix, hc):
# network cluster, mapped back to original haplotype indices
network_cluster = set(hap_ixs_995F.take(network_clusters_995F[nc_ix]))
# find intersection
hierarchical_cluster = hierarchical_clusters[hc]
n_isec = len(hierarchical_cluster.intersection(network_cluster))
# outputs
print(repr(hc), n_isec, len(hierarchical_cluster), len(network_cluster),
'{:.1f}% concordance'.format(n_isec * 100 / len(hierarchical_cluster)))
# assign
network_cluster_membership[list(network_cluster)] = hc
In [36]:
# find the network components with more than 2 haplotypes
for i, l in enumerate(network_clusters_995F):
if len(l) > 2:
print(i, len(l))
In [37]:
pop_995F.iloc[network_clusters_995F[0]].value_counts()
Out[37]:
In [38]:
find_concordance_995F(0, 'F1')
In [39]:
pop_995F.iloc[network_clusters_995F[8]].value_counts()
Out[39]:
In [40]:
find_concordance_995F(8, 'F5')
In [41]:
find_concordance_995F(10, 'F4')
In [42]:
find_concordance_995F(9, 'F3')
In [43]:
find_concordance_995F(11, 'F2')
In [44]:
# compute nucleotide diversity within clusters
is_accessible = phase1_ar3.accessibility['2L/is_accessible'][:]
is_accessible_vgsc = is_accessible[region_vgsc.loc0]
n_bp_accessible_vgsc = np.count_nonzero(is_accessible_vgsc)
n_bp_accessible_vgsc
Out[44]:
In [45]:
def compute_pi_995F():
for i, s in enumerate(network_clusters_995F):
if len(s) >= 3:
h = h_vgsc_995F.take(s, axis=1)
ac = h.count_alleles(max_allele=1)
mpd = allel.mean_pairwise_difference(ac)
pi = np.sum(mpd) / n_bp_accessible_vgsc
print(i, pi)
compute_pi_995F()
In [46]:
graph_995S, h_distinct_sets_995S, components_995S = hapclust.graph_haplotype_network(
h_vgsc_995S,
network_method='mjn',
max_dist=2,
hap_colors=hap_colors_995S,
variant_labels=variant_labels,
return_components=True, # new option, N.B., changes return values
show_singletons=False, # new option, don't show unconnected singleton nodes, makes graph less noisy
)
graph_995S.format = 'svg'
fn = '../artwork/995S_clusters_mjn_maxdist2'
graph_995S.render(fn)
graph_995S
Out[46]:
In [47]:
network_clusters_995S = identify_components(h_distinct_sets_995S, components_995S)
for i, l in enumerate(network_clusters_995S):
if len(l) > 2:
print(i, len(l))
In [48]:
def find_concordance_995S(nc_ix, hc):
# network cluster, mapped back to original haplotype indices
network_cluster = set(hap_ixs_995S.take(network_clusters_995S[nc_ix]))
# find intersection
hierarchical_cluster = hierarchical_clusters[hc]
n_isec = len(hierarchical_cluster.intersection(network_cluster))
# outputs
print(repr(hc), n_isec, len(hierarchical_cluster), len(network_cluster),
'{:.1f}% concordance'.format(n_isec * 100 / len(hierarchical_cluster)))
# assign
network_cluster_membership[list(network_cluster)] = hc
In [49]:
find_concordance_995S(0, 'S1')
In [50]:
find_concordance_995S(1, 'S3')
In [51]:
find_concordance_995S(3, 'S4')
In [52]:
find_concordance_995S(5, 'S5')
In [53]:
find_concordance_995S(8, 'S2')
In [54]:
# overall concordance
# for the comparison we ignore the FX and SX classes which are not output from the hierarchical clustering
network_cluster_membership_compare = network_cluster_membership.copy()
network_cluster_membership_compare[network_cluster_membership == 'FX'] = ''
network_cluster_membership_compare[network_cluster_membership == 'SX'] = ''
np.count_nonzero(hierarchical_cluster_membership == network_cluster_membership_compare) * 100 / len(hierarchical_cluster_membership)
Out[54]:
In [55]:
np.unique(hierarchical_cluster_membership)
Out[55]:
In [56]:
np.unique(network_cluster_membership)
Out[56]:
In [57]:
np.unique(network_cluster_membership_compare)
Out[57]:
In [58]:
def compute_pi_995S():
for i, s in enumerate(network_clusters_995S):
if len(s) >= 3:
h = h_vgsc_995S.take(s, axis=1)
ac = h.count_alleles(max_allele=1)
mpd = allel.mean_pairwise_difference(ac)
pi = np.sum(mpd) / n_bp_accessible_vgsc
print(i, pi)
compute_pi_995S()
In [59]:
graph_1527T, h_distinct_sets_1527T, components_1527T = hapclust.graph_haplotype_network(
h_vgsc_1527T,
network_method='mjn',
max_dist=15,
hap_colors=hap_colors_1527T,
variant_labels=variant_labels,
return_components=True, # new option, N.B., changes return values
show_singletons=True, # new option, don't show unconnected singleton nodes, makes graph less noisy
)
graph_1527T.format = 'svg'
fn = '../artwork/1527_clusters_mjn_maxdist2'
graph_1527T.render(fn)
graph_1527T
Out[59]:
In [60]:
variant_labels[838]
Out[60]:
In [61]:
network_clusters_1527T = identify_components(h_distinct_sets_1527T, components_1527T)
for i, l in enumerate(network_clusters_1527T):
if len(l) > 0:
print(i, len(l))
In [62]:
h_vgsc_1527T[838]
Out[62]:
In [63]:
h_vgsc_1527T[838, network_clusters_1527T[0]]
Out[63]:
In [64]:
h_vgsc_1527T[838, network_clusters_1527T[1]]
Out[64]:
In [65]:
h_vgsc_1527T[838, network_clusters_1527T[2]]
Out[65]:
In [66]:
h_vgsc_1527T[838, network_clusters_1527T[3]]
Out[66]:
In [67]:
h_vgsc_1527T[838, network_clusters_1527T[4]]
Out[67]:
In [68]:
h_vgsc_1527T[838, network_clusters_1527T[5]]
Out[68]:
So two main network components, one for each of the two V402L alternate alleles.
In [69]:
network_cluster_membership[loc_1527T] = 'L1'
In [70]:
graph_490I, h_distinct_sets_490I, components_490I = hapclust.graph_haplotype_network(
h_vgsc_490I,
network_method='mjn',
max_dist=15,
hap_colors=hap_colors_490I,
variant_labels=variant_labels,
return_components=True, # new option, N.B., changes return values
show_singletons=True, # new option, don't show unconnected singleton nodes, makes graph less noisy
)
graph_490I.format = 'svg'
fn = '../artwork/490_clusters_mjn_maxdist2'
graph_490I.render(fn)
graph_490I
Out[70]:
In [71]:
network_cluster_membership[loc_490I] = 'L2'
In [72]:
np.save('../data/median_joining_network_membership.npy', network_cluster_membership)
In [73]:
collections.Counter(network_cluster_membership).most_common()
Out[73]:
In [74]:
def compute_wt_diversity(pop):
loc_pop = (df_haplotypes.population == pop).values
loc_pop.shape
h_vgsc_wt_pop = h_vgsc.compress(~loc_995S & ~loc_995F & ~loc_1527T & loc_pop, axis=1)
ac = h_vgsc_wt_pop.count_alleles(max_allele=1)
mpd = allel.mean_pairwise_difference(ac)
pi = np.sum(mpd) / n_bp_accessible_vgsc
return h_vgsc_wt_pop.shape, pi
In [75]:
compute_wt_diversity('CMS')
Out[75]:
In [76]:
compute_wt_diversity('GWA')
Out[76]:
In [77]:
compute_wt_diversity('BFM')
Out[77]:
In [78]:
compute_wt_diversity('AOM')
Out[78]: