Generate median-joining network artwork and cluster membership


In [1]:
%run setup.ipynb
%matplotlib inline
# %load_ext autoreload
# %autoreload 1
# %aimport hapclust
import hapclust
import ag1k

Data setup


In [2]:
# load data
callset_haps = np.load('../data/haps_phase1.npz')
haps = allel.HaplotypeArray(callset_haps['haplotypes'])
pos = allel.SortedIndex(callset_haps['POS'])
ann = callset_haps['ANN']

In [3]:
# chop into gene
loc_vgsc = pos.locate_range(region_vgsc.start, region_vgsc.end)
h_vgsc = haps[loc_vgsc]
pos_vgsc = pos[loc_vgsc]
h_vgsc


Out[3]:
<HaplotypeArray shape=(1713, 1530) dtype=int8>
        0  1  2  3  4  ...  1525  1526  1527  1528  1529
0       0  0  0  0  0  ...     0     0     0     0     0
1       0  0  0  0  0  ...     0     0     0     0     0
2       0  0  0  0  0  ...     0     0     0     0     0
...
1710    0  0  0  0  0  ...     0     0     0     0     0
1711    0  0  0  0  0  ...     0     0     0     0     0
1712    0  0  0  0  0  ...     0     0     0     0     0

In [4]:
# load haplotype metadata
df_haplotypes = ag1k.phase1_ar31.df_haplotypes.query('population != "colony"')
df_haplotypes.head()


Out[4]:
       label      ox_code   population  label_aug                             country       region  sex  m_s  kt_2la  kt_2rb
index
0      AB0085-Ca  AB0085-C  BFS         AB0085-Ca [Burkina Faso, Pala, S, F]  Burkina Faso  Pala    F    S    2.0     2.0
1      AB0085-Cb  AB0085-C  BFS         AB0085-Cb [Burkina Faso, Pala, S, F]  Burkina Faso  Pala    F    S    2.0     2.0
2      AB0087-Ca  AB0087-C  BFM         AB0087-Ca [Burkina Faso, Bana, M, F]  Burkina Faso  Bana    F    M    2.0     1.0
3      AB0087-Cb  AB0087-C  BFM         AB0087-Cb [Burkina Faso, Bana, M, F]  Burkina Faso  Bana    F    M    2.0     1.0
4      AB0088-Ca  AB0088-C  BFM         AB0088-Ca [Burkina Faso, Bana, M, F]  Burkina Faso  Bana    F    M    2.0     0.0

In [5]:
assert len(df_haplotypes) == h_vgsc.shape[1]

In [6]:
df_haplotypes.groupby(by=('country', 'population')).count()[['label']]


Out[6]:
                          label
country       population
Angola        AOM           120
Burkina Faso  BFM           138
              BFS           162
Cameroon      CMS           550
Gabon         GAS           112
Guinea        GNS            62
Guinea-Bissau GWA            92
Kenya         KES            88
Uganda        UGS           206

In [7]:
hap_pops = df_haplotypes.population.values
# need to use named colors for graphviz
pop_colors = {
    'AOM': 'brown',
    'BFM': 'firebrick1',
    'GWA': 'goldenrod1',
    'GNS': 'cadetblue1',
    'BFS': 'deepskyblue',
    'CMS': 'dodgerblue3',
    'UGS': 'palegreen',
    'GAS': 'olivedrab',
    'KES': 'grey47',
    'colony': 'black'
}
hap_colors = np.array([pop_colors[p] for p in hap_pops])

In [8]:
# make an array of original haplotype indices, we'll need this later on to help
# figure out cluster concordances
hap_ixs = np.arange(len(df_haplotypes))
hap_ixs


Out[8]:
array([   0,    1,    2, ..., 1527, 1528, 1529])
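
As an aside, a minimal sketch with toy values (not part of the analysis) of why keeping these original indices works: compress() preserves the original index values, and take() then maps cluster-local positions back to them.

# minimal sketch, hypothetical toy values only
toy_ixs = np.arange(10)
toy_mask = np.array([False, True, False, True, True, False, False, True, False, False])
toy_sub = toy_ixs.compress(toy_mask)   # array([1, 3, 4, 7]) -- original indices survive subsetting
toy_sub.take([0, 2])                   # array([1, 4]) -- cluster-local positions map back to original indices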

In [9]:
# obtain labels for non-synonymous variants
tbl_variant_labels = (
    etl
    .frompickle('../data/tbl_variants_phase1.pkl')
#     .eq('num_alleles', 2)
    .cut('POS', 'AGAP004707-RA')
    .convert('AGAP004707-RA', lambda v: v[1] if v[0] == 'NON_SYNONYMOUS_CODING' else '')
    .rename('AGAP004707-RA', 'label')
)
tbl_variant_labels


Out[9]:
POS      label
-------  -----
2358254  D33N
2358316
2358328
2358353
2358405
...


In [10]:
len(tbl_variant_labels)


Out[10]:
7279

In [11]:
pos2label = tbl_variant_labels.lookupone('POS', 'label')
pos2label[2358254]


Out[11]:
'D33N'

In [12]:
variant_labels = np.array([pos2label.get(p, '') for p in pos_vgsc], dtype=object)
variant_labels[:5]


Out[12]:
array(['D33N', '', '', '', ''], dtype=object)

In [13]:
pos_vgsc


Out[13]:
<SortedIndex shape=(1713,) dtype=int32>
0        1        2        3        4        ...  1708     1709     1710     1711     1712
2358254  2358316  2358328  2358353  2358405  ...  2431417  2431487  2431518  2431527  2431542

In [14]:
# check network construction works - network all haplotypes
graph, _ = hapclust.graph_haplotype_network(
    h_vgsc,
    network_method='msn',
    hap_colors=hap_colors,
    max_dist=2,
    variant_labels=variant_labels,
    show_singletons=False)
graph


Out[14]:
[graphviz rendering of the minimum spanning network ('msn') over all haplotypes: nodes are distinct haplotypes coloured by population; edges are labelled with non-synonymous substitutions (e.g. L995F, L995S, N1570Y)]

In [15]:
# locate haplotypes with resistance alleles
pos_995S = 2422651
pos_995F = 2422652
pos_1527T = 2429617
pos_490I = 2400071
loc_995S = h_vgsc[pos_vgsc.locate_key(pos_995S)] == 1
loc_995F = h_vgsc[pos_vgsc.locate_key(pos_995F)] == 1
loc_1527T = h_vgsc[pos_vgsc.locate_key(pos_1527T)] == 1
loc_490I = h_vgsc[pos_vgsc.locate_key(pos_490I)] == 1

In [16]:
pop_995F = df_haplotypes.population.iloc[loc_995F]
pop_995F.head()


Out[16]:
index
0    BFS
1    BFS
3    BFM
4    BFM
5    BFM
Name: population, dtype: object

In [17]:
pop_995F.value_counts()


Out[17]:
CMS    291
BFS    162
BFM    117
AOM    103
GNS     62
GAS     40
Name: population, dtype: int64

In [18]:
pop_995S = df_haplotypes.population.iloc[loc_995S]
pop_995S.head()


Out[18]:
index
300    UGS
301    UGS
302    UGS
303    UGS
304    UGS
Name: population, dtype: object

In [19]:
pop_995S.value_counts()


Out[19]:
UGS    206
CMS     85
GAS     72
KES     67
Name: population, dtype: int64

In [20]:
pop_1527T = df_haplotypes.population.iloc[loc_1527T]
pop_1527T.value_counts()


Out[20]:
BFM    19
Name: population, dtype: int64

In [21]:
pop_490I = df_haplotypes.population.iloc[loc_490I]
pop_490I.value_counts()


Out[21]:
KES    16
Name: population, dtype: int64

In [22]:
np.count_nonzero(loc_995F), np.count_nonzero(loc_995S), np.count_nonzero(loc_1527T), np.count_nonzero(loc_490I)


Out[22]:
(775, 430, 19, 16)

In [23]:
# resistance haps
h_vgsc_995F = h_vgsc.compress(loc_995F, axis=1)
h_vgsc_995S = h_vgsc.compress(loc_995S, axis=1)
h_vgsc_1527T = h_vgsc.compress(loc_1527T, axis=1)
h_vgsc_490I = h_vgsc.compress(loc_490I, axis=1)

In [24]:
# colours
hap_colors_995F = hap_colors.compress(loc_995F)
hap_colors_995S = hap_colors.compress(loc_995S)
hap_colors_1527T = hap_colors.compress(loc_1527T)
hap_colors_490I = hap_colors.compress(loc_490I)

In [25]:
# original haplotype indices - will need these for cluster correspondence
hap_ixs_995F = hap_ixs.compress(loc_995F)
hap_ixs_995S = hap_ixs.compress(loc_995S)
hap_ixs_1527T = hap_ixs.compress(loc_1527T)
hap_ixs_490I = hap_ixs.compress(loc_490I)

995F networks


In [26]:
graph_995F, h_distinct_sets_995F, components_995F = hapclust.graph_haplotype_network(
    h_vgsc_995F, 
    network_method='mjn', 
    max_dist=2, 
    hap_colors=hap_colors_995F, 
    variant_labels=variant_labels,
    return_components=True,  # new option, N.B., changes return values
    show_singletons=False,  # new option, don't show unconnected singleton nodes, makes graph less noisy
)
graph_995F.format = 'svg'
fn = '../artwork/995F_clusters_mjn_maxdist2'
graph_995F.render(fn)
graph_995F


Out[26]:
[graphviz rendering of the 995F median-joining network (max_dist=2), also written to ../artwork/995F_clusters_mjn_maxdist2.svg]

In [27]:
# this tells you, for each *distinct* haplotype (i.e., node in the graph), 
# which "connected component" (i.e., cluster) it belongs to
components_995F


Out[27]:
array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  2,  0,  2,  0,  0,  0,
        0,  0,  3,  4,  0,  5,  0,  0,  6,  7,  0,  8,  0,  9,  9, 10,  8,
        8, 10,  9, 10,  0, 10,  8,  8, 11, 12,  8, 13, 14,  9, 15,  8, 10,
       16, 17,  8, 11,  8, 18,  8,  0,  0,  0,  0,  8,  8,  8,  9,  8, 19,
       11, 20,  9,  8, 21,  9,  0, 22,  8, 23,  0,  8,  0,  0, 24, 25, 26,
       27,  0,  0, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 10,  8, 10, 10,
       10, 10,  0,  0], dtype=int32)

In [28]:
# this tells you the number of distinct haplotypes for each connected component
# e.g., component 0 has 51 *distinct* haplotypes (= 51 nodes in the graph)
# e.g., component 8 has 18 *distinct* haplotypes (= 18 nodes in the graph)
np.bincount(components_995F)


Out[28]:
array([51,  1,  2,  1,  1,  1,  1,  1, 18,  7, 10,  3,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1])

In [29]:
def identify_components(h_distinct_sets, components):
    """This function is designed to collect all indices for original haplotypes 
    within each connected component. I.e., it finds clusters from the network."""
    clusters = []
    for c in np.unique(components):
        cluster = set()
        for i in np.nonzero(components == c)[0]:
            cluster |= h_distinct_sets[i]
        clusters.append(sorted(cluster))
    return clusters

In [30]:
network_clusters_995F = identify_components(h_distinct_sets_995F, components_995F)
len(network_clusters_995F)


Out[30]:
38

In [31]:
len(network_clusters_995F[0])


Out[31]:
456

In [32]:
# load up hierarchical clustering results from previous analysis
hierarchical_cluster_membership = np.load('../data/hierarchical_cluster_membership.npy').astype('U')
hierarchical_clusters = {('other' if not k else k): set(np.nonzero(hierarchical_cluster_membership == k)[0]) 
                         for k in np.unique(hierarchical_cluster_membership)}
for k in sorted(hierarchical_clusters):
    print(repr(k), len(hierarchical_clusters[k]))


'F1' 453
'F2' 14
'F3' 38
'F4' 42
'F5' 196
'S1' 108
'S2' 79
'S3' 165
'S4' 37
'S5' 36
'other' 362

In [33]:
len(hierarchical_clusters['F1'])


Out[33]:
453

In [34]:
# setup to quantify overall concordance
network_cluster_membership = np.full_like(hierarchical_cluster_membership, fill_value='')
network_cluster_membership[loc_995F] = 'FX'
network_cluster_membership[loc_995S] = 'SX'

In [35]:
def find_concordance_995F(nc_ix, hc):
    
    # network cluster, mapped back to original haplotype indices
    network_cluster = set(hap_ixs_995F.take(network_clusters_995F[nc_ix]))
    
    # find intersection
    hierarchical_cluster = hierarchical_clusters[hc]
    n_isec = len(hierarchical_cluster.intersection(network_cluster))

    # outputs
    print(repr(hc), n_isec, len(hierarchical_cluster), len(network_cluster), 
          '{:.1f}% concordance'.format(n_isec * 100 / len(hierarchical_cluster)))
    
    # assign
    network_cluster_membership[list(network_cluster)] = hc

In [36]:
# find the network components with more than 2 haplotypes
for i, l in enumerate(network_clusters_995F):
    if len(l) > 2:
        print(i, len(l))


0 456
8 187
9 50
10 35
11 13

In [37]:
pop_995F.iloc[network_clusters_995F[0]].value_counts()


Out[37]:
BFS    161
BFM    110
AOM     89
GNS     62
CMS     34
Name: population, dtype: int64

In [38]:
find_concordance_995F(0, 'F1')


'F1' 448 453 456 98.9% concordance

In [39]:
pop_995F.iloc[network_clusters_995F[8]].value_counts()


Out[39]:
CMS    163
GAS     24
Name: population, dtype: int64

In [40]:
find_concordance_995F(8, 'F5')


'F5' 187 196 187 95.4% concordance

In [41]:
find_concordance_995F(10, 'F4')


'F4' 35 42 35 83.3% concordance

In [42]:
find_concordance_995F(9, 'F3')


'F3' 37 38 50 97.4% concordance

In [43]:
find_concordance_995F(11, 'F2')


'F2' 13 14 13 92.9% concordance

In [44]:
# compute nucleotide diversity within clusters
is_accessible = phase1_ar3.accessibility['2L/is_accessible'][:]
is_accessible_vgsc = is_accessible[region_vgsc.loc0]
n_bp_accessible_vgsc = np.count_nonzero(is_accessible_vgsc)
n_bp_accessible_vgsc


Out[44]:
32966
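
In the next cell, nucleotide diversity within each cluster is estimated as the sum over variant sites of the mean pairwise difference, scaled by the number of accessible bases in the gene region, i.e. pi = sum_j MPD_j / n_bp_accessible_vgsc, giving diversity per accessible base.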

In [45]:
def compute_pi_995F():
    for i, s in enumerate(network_clusters_995F):
        if len(s) >= 3:
            h = h_vgsc_995F.take(s, axis=1)
            ac = h.count_alleles(max_allele=1)
            mpd = allel.mean_pairwise_difference(ac)
            pi = np.sum(mpd) / n_bp_accessible_vgsc
            print(i, pi)
            
compute_pi_995F()


0 5.09568598062e-05
8 6.80258218903e-06
9 3.52125318974e-05
10 3.10990136514e-05
11 1.40004386804e-05

995S networks


In [46]:
graph_995S, h_distinct_sets_995S, components_995S = hapclust.graph_haplotype_network(
    h_vgsc_995S, 
    network_method='mjn', 
    max_dist=2, 
    hap_colors=hap_colors_995S, 
    variant_labels=variant_labels,
    return_components=True,  # new option, N.B., changes return values
    show_singletons=False,  # new option, don't show unconnected singleton nodes, makes graph less noisy
)
graph_995S.format = 'svg'
fn = '../artwork/995S_clusters_mjn_maxdist2'
graph_995S.render(fn)
graph_995S


Out[46]:
[graphviz rendering of the 995S median-joining network (max_dist=2), also written to ../artwork/995S_clusters_mjn_maxdist2.svg]

In [47]:
network_clusters_995S = identify_components(h_distinct_sets_995S, components_995S)
for i, l in enumerate(network_clusters_995S):
    if len(l) > 2:
        print(i, len(l))


0 107
1 165
3 36
5 34
8 78

In [48]:
def find_concordance_995S(nc_ix, hc):
    
    # network cluster, mapped back to original haplotype indices
    network_cluster = set(hap_ixs_995S.take(network_clusters_995S[nc_ix]))
    
    # find intersection
    hierarchical_cluster = hierarchical_clusters[hc]
    n_isec = len(hierarchical_cluster.intersection(network_cluster))

    # outputs
    print(repr(hc), n_isec, len(hierarchical_cluster), len(network_cluster), 
          '{:.1f}% concordance'.format(n_isec * 100 / len(hierarchical_cluster)))
    
    # assign
    network_cluster_membership[list(network_cluster)] = hc
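
The function above mirrors the earlier find_concordance_995F; as a sketch (hypothetical, not used in the cells below), a single parameterised helper could cover both mutations:

def find_concordance(network_clusters, hap_ixs_sub, nc_ix, hc):
    # sketch of a generalised find_concordance_995F/995S; assumes the same globals
    # (hierarchical_clusters, network_cluster_membership) defined earlier
    network_cluster = set(hap_ixs_sub.take(network_clusters[nc_ix]))
    hierarchical_cluster = hierarchical_clusters[hc]
    n_isec = len(hierarchical_cluster & network_cluster)
    print(repr(hc), n_isec, len(hierarchical_cluster), len(network_cluster),
          '{:.1f}% concordance'.format(n_isec * 100 / len(hierarchical_cluster)))
    network_cluster_membership[list(network_cluster)] = hc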

In [49]:
find_concordance_995S(0, 'S1')


'S1' 107 108 107 99.1% concordance

In [50]:
find_concordance_995S(1, 'S3')


'S3' 165 165 165 100.0% concordance

In [51]:
find_concordance_995S(3, 'S4')


'S4' 36 37 36 97.3% concordance

In [52]:
find_concordance_995S(5, 'S5')


'S5' 34 36 34 94.4% concordance

In [53]:
find_concordance_995S(8, 'S2')


'S2' 78 79 78 98.7% concordance

In [54]:
# overall concordance
# for the comparison, ignore the FX and SX classes, which are not output by the hierarchical clustering
network_cluster_membership_compare = network_cluster_membership.copy()
network_cluster_membership_compare[network_cluster_membership == 'FX'] = ''
network_cluster_membership_compare[network_cluster_membership == 'SX'] = ''
np.count_nonzero(hierarchical_cluster_membership == network_cluster_membership_compare) * 100 / len(hierarchical_cluster_membership)


Out[54]:
96.79738562091504

In [55]:
np.unique(hierarchical_cluster_membership)


Out[55]:
array(['', 'F1', 'F2', 'F3', 'F4', 'F5', 'S1', 'S2', 'S3', 'S4', 'S5'],
      dtype='<U2')

In [56]:
np.unique(network_cluster_membership)


Out[56]:
array(['', 'F1', 'F2', 'F3', 'F4', 'F5', 'FX', 'S1', 'S2', 'S3', 'S4',
       'S5', 'SX'],
      dtype='<U2')

In [57]:
np.unique(network_cluster_membership_compare)


Out[57]:
array(['', 'F1', 'F2', 'F3', 'F4', 'F5', 'S1', 'S2', 'S3', 'S4', 'S5'],
      dtype='<U2')

In [58]:
def compute_pi_995S():
    for i, s in enumerate(network_clusters_995S):
        if len(s) >= 3:
            h = h_vgsc_995S.take(s, axis=1)
            ac = h.count_alleles(max_allele=1)
            mpd = allel.mean_pairwise_difference(ac)
            pi = np.sum(mpd) / n_bp_accessible_vgsc
            print(i, pi)
            
compute_pi_995S()


0 1.38004676818e-05
1 1.43084552298e-05
3 5.05571396793e-06
5 2.57381802004e-05
8 1.97379921945e-05

1527T


In [59]:
graph_1527T, h_distinct_sets_1527T, components_1527T = hapclust.graph_haplotype_network(
    h_vgsc_1527T, 
    network_method='mjn', 
    max_dist=15, 
    hap_colors=hap_colors_1527T, 
    variant_labels=variant_labels,
    return_components=True,  # new option, N.B., changes return values
    show_singletons=True,  # new option; here we do show unconnected singleton nodes
)
graph_1527T.format = 'svg'
fn = '../artwork/1527_clusters_mjn_maxdist2'
graph_1527T.render(fn)
graph_1527T


Out[59]:
[graphviz rendering of the 1527T median-joining network (max_dist=15, singletons shown), also written to ../artwork/1527_clusters_mjn_maxdist2.svg]

In [60]:
variant_labels[838]


Out[60]:
'V402L'

In [61]:
network_clusters_1527T = identify_components(h_distinct_sets_1527T, components_1527T)
for i, l in enumerate(network_clusters_1527T):
    if len(l) > 0:
        print(i, len(l))


0 8
1 7
2 1
3 1
4 1
5 1

In [62]:
h_vgsc_1527T[838]


Out[62]:
array([2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 0, 2, 1, 1, 2, 0, 2, 1], dtype=int8)

In [63]:
h_vgsc_1527T[838, network_clusters_1527T[0]]


Out[63]:
array([2, 2, 2, 2, 2, 2, 2, 2], dtype=int8)

In [64]:
h_vgsc_1527T[838, network_clusters_1527T[1]]


Out[64]:
array([1, 1, 1, 1, 1, 1, 1], dtype=int8)

In [65]:
h_vgsc_1527T[838, network_clusters_1527T[2]]


Out[65]:
array([0], dtype=int8)

In [66]:
h_vgsc_1527T[838, network_clusters_1527T[3]]


Out[66]:
array([2], dtype=int8)

In [67]:
h_vgsc_1527T[838, network_clusters_1527T[4]]


Out[67]:
array([0], dtype=int8)

In [68]:
h_vgsc_1527T[838, network_clusters_1527T[5]]


Out[68]:
array([1], dtype=int8)

So there are two main network components, one for each of the two alternate alleles at the V402L codon.
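
A compact way to see this (a sketch equivalent to the cell-by-cell checks above) is to summarise the allele carried at the V402L site by each network component:

# sketch: report the V402L allele(s) carried by each network component
for i, cluster in enumerate(network_clusters_1527T):
    print(i, len(cluster), np.unique(h_vgsc_1527T[838, cluster]))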


In [69]:
network_cluster_membership[loc_1527T] = 'L1'

490I


In [70]:
graph_490I, h_distinct_sets_490I, components_490I = hapclust.graph_haplotype_network(
    h_vgsc_490I, 
    network_method='mjn', 
    max_dist=15, 
    hap_colors=hap_colors_490I, 
    variant_labels=variant_labels,
    return_components=True,  # new option, N.B., changes return values
    show_singletons=True,  # new option; here we do show unconnected singleton nodes
)
graph_490I.format = 'svg'
fn = '../artwork/490_clusters_mjn_maxdist2'
graph_490I.render(fn)
graph_490I


Out[70]:
[graphviz rendering of the 490I median-joining network (max_dist=15, singletons shown), also written to ../artwork/490_clusters_mjn_maxdist2.svg]

In [71]:
network_cluster_membership[loc_490I] = 'L2'

In [72]:
np.save('../data/median_joining_network_membership.npy', network_cluster_membership)

In [73]:
collections.Counter(network_cluster_membership).most_common()


Out[73]:
[('F1', 456),
 ('', 291),
 ('F5', 187),
 ('S3', 165),
 ('S1', 107),
 ('S2', 78),
 ('F3', 50),
 ('S4', 36),
 ('F4', 35),
 ('S5', 34),
 ('FX', 33),
 ('L1', 19),
 ('L2', 16),
 ('F2', 13),
 ('SX', 10)]

Wild-type diversity


In [74]:
def compute_wt_diversity(pop):

    # select wild-type haplotypes (no 995S, 995F or 1527T alleles) from the given population
    loc_pop = (df_haplotypes.population == pop).values

    h_vgsc_wt_pop = h_vgsc.compress(~loc_995S & ~loc_995F & ~loc_1527T & loc_pop, axis=1)

    ac = h_vgsc_wt_pop.count_alleles(max_allele=1)
    mpd = allel.mean_pairwise_difference(ac)
    pi = np.sum(mpd) / n_bp_accessible_vgsc

    return h_vgsc_wt_pop.shape, pi

In [75]:
compute_wt_diversity('CMS')


Out[75]:
((1713, 174), 0.0013963001235142061)

In [76]:
compute_wt_diversity('GWA')


Out[76]:
((1713, 92), 0.0057312520429987964)

In [77]:
compute_wt_diversity('BFM')


Out[77]:
((1713, 3), 0.0031749883718578736)

In [78]:
compute_wt_diversity('AOM')


Out[78]:
((1713, 17), 0.0028938014567593704)