Biforcating Tree Dataset

This dataset intends to create a biforcating tree with 8 samples, three runs per sample.


In [ ]:
import utils
import gzip
import random
import string
import math
import ete3 as ete
import matplotlib.pyplot as plt

In [ ]:
%matplotlib inline

In [ ]:
seed = 1003
genome_size = 1 # mbp
num_samples = 8
num_runs = 3

mean_n_reads = 5e5
sd_n_reads = mean_n_reads * 0.1 # Coeff of Var = 0.1
min_n_reads = mean_n_reads / 100.0

countgraph_size = 1e8

Don't edit below here

Constants are all in the cell above this


In [ ]:
! rm -rf data 
! mkdir data
for subdir  in ['genomes', 'fastq', 'countgraphs']:
    ! mkdir data/{subdir}

Set a random seed and seed the RNG


In [ ]:
random.seed(seed)
utils.random.seed(seed)

In [ ]:
genome = utils.make_rand_genome(mbp=genome_size)

In [ ]:
levels = int(math.ceil(math.log2(num_samples)))

In [ ]:
bifork = utils.biforcating_sequences(genome, levels=levels, av_rate=0.0001, sd_rate=0.00001)
seqlist = list(utils.flatten(bifork))

In [ ]:
with open("data/bifork.fas", 'w') as fh:
    utils.print_multifasta(seqlist, file=fh)

Make NJ tree


In [ ]:
from skbio import Alignment, DNA, DistanceMatrix
from skbio.tree import nj
import numpy as np

In [ ]:
aln = Alignment.read('data/bifork.fas')

In [ ]:
distmat = aln.distances()

Make a repeated version of this distance matrix, which can be directly (visually) compared with the result of kWIP, which will of course be of the runs, not the genomes themselves.


In [ ]:
distmat_reps = DistanceMatrix(
    np.repeat(np.repeat(distmat.data,num_runs, axis=1), num_runs, axis=0))

In [ ]:
distmat_reps

In [ ]:
tree_reps = nj(distmat_reps)
tree_reps.write('data/bifork_runs.nwk')

Generate reads


In [ ]:
genomes = {}
runs = []
r2g = {}
for i, seq in enumerate(seqlist):
    genome = string.ascii_uppercase[i]
    print('Genome', genome, end=', reps: ')
    genomes[genome] = []
    
    # write genome
    fas = 'data/genomes/bifork_{}.fasta'.format(genome)
    with open(fas, 'wb') as fh:
        fh.write(">{}\n{}\n".format(genome, seq).encode('ascii'))
    
    # create each run
    for j in range(num_runs):
        print(j, end=' ')
        fq = "data/fastq/bifork_{}-{}_il.fq".format(genome, j)
        n_reads = max(int(random.gauss(mean_n_reads, sd_n_reads)), min_n_reads)
        utils.wgsim(n_reads, fas, fq)
        genomes[genome].append(fq)
        runs.append(fq)
        r2g[fq] = genome
    print()

Hash samples


In [ ]:
def countgraph(fq, cg, x=1e9, k=20, n=1, quiet=True):
    lic = "load-into-countgraph.py -T 8 -N {N} -k {k} -x {x} -s tsv -b {cg} {fq}".format(
            N=n, k=k, x=x, cg=cg, fq=fq)
    print(lic)
    utils.run_cmd(lic, quiet)

In [ ]:
countgraphs = []
for genome in genomes:
    for i, fq in enumerate(genomes[genome]):
        cg = 'data/countgraphs/bifork_{}-{}.cg'.format(genome, i)
        countgraphs.append(cg)
        countgraph(fq, cg, x=countgraph_size, k=20)

Run kWIP


In [ ]:
def kwip(countgraphs, dist, kern='', weighted=True, quiet=True):
    if kern:
        kern = '-k {kern}'.format(kern=kern)
    unweight = ''  
    if not weighted:
        unweight = '-U'        
    cgs = ' '.join(countgraphs)
    cmd = "kwip {kern} {wht} -d {dist} {cgs}".format(wht=unweight, kern=kern, dist=dist, cgs=cgs)
    print(cmd)
    utils.run_cmd(cmd, quiet)

In [ ]:
kwip(sorted(countgraphs), 'data/bifork-kwip.dist', 'data/bifork-kwip.kern')

In [ ]:
kwip(sorted(countgraphs), 'data/bifork-ip.dist', 'data/bifork-ip.kern', weighted=False)

Analyse the output


In [ ]:
from skbio import DistanceMatrix
from skbio.tree import nj

Make a tree from kWIP's output


In [ ]:
kwip_dist = DistanceMatrix.read("data/bifork-kwip.dist")
kwip_dist.ids = [x.split('_')[1] for x in kwip_dist.ids]
distmat_reps.ids = kwip_dist.ids
ip_dist = DistanceMatrix.read("data/bifork-ip.dist")
ip_dist.ids = kwip_dist.ids

In [ ]:
kwip_dist.plot(title='kWIP Distances')
plt.xlim(0, 24)
plt.ylim(0, 24)
plt.savefig('data/kwip-mat.png', dpi=320)
distmat_reps.plot(title='True Distances')
plt.xlim(0, 24)
plt.ylim(0, 24)
plt.savefig('data/true-mat.png', dpi=320)
print()

In [ ]:
ip_dist.plot(title='Unweighted IP Distances')
plt.xlim(0, 24)
plt.ylim(0, 24)
plt.savefig('data/ip-mat.png', dpi=320)

In [ ]:
kwip_tree = nj(kwip_dist)
kwip_tree.write('data/bifork_kwip.nwk')

Robinson-Foulds distance

A measure of tree concordance. Smaller is better


In [ ]:
true_tree = ete.Tree("data/bifork_runs.nwk")
kwip_tree = ete.Tree("data/bifork_kwip.nwk")

And the RF distance is....


In [ ]:
kwip_tree.robinson_foulds(true_tree, unrooted_trees=True)[0]

Hierarchical clustering

And ploting by matplotlib


In [ ]:
from scipy.cluster import hierarchy as hier

In [ ]:
plt.figure(figsize=(10,4))
z = hier.linkage(kwip_dist.condensed_form(), method='complete')
x = hier.dendrogram(z, labels=kwip_dist.ids)
plt.title("kWIP Dendrogram")
plt.savefig('data/kwip-dendro.png')

In [ ]:
plt.figure(figsize=(10,4))
z = hier.linkage(distmat_reps.condensed_form(), method='complete')
x = hier.dendrogram(z, labels=distmat_reps.ids)
plt.title("True Dendrogram")
plt.savefig('data/true-dendro.png')