Symmetric Tree Dataset

This notebook creates a dataset from a symmetric bifurcating tree of 8 samples, with three sequencing runs per sample.


In [ ]:
import utils
import gzip
import random
import string
import math
import ete3 as ete
from skbio import Alignment, DNA, DistanceMatrix
from skbio.tree import nj
import numpy as np
import skbio
import sys

In [ ]:
seed = 1003
genome_size = 1  # Mbp
num_samples = 8
num_runs = 3

mean_n_reads = 5e5
sd_n_reads = mean_n_reads * 0.1 # Coeff of Var = 0.1
min_n_reads = mean_n_reads / 100.0

countgraph_size = 1e8

Don't edit below here

All constants are set in the cell above this point


In [ ]:
! rm -rf data
! mkdir data
for subdir in ['genomes', 'fastq', 'countgraphs']:
    ! mkdir data/{subdir}

Seed the random number generators (both here and in the utils module)


In [ ]:
random.seed(seed)
utils.random.seed(seed)

Generate sample genomes

First, we make a tree with the following structure


In [ ]:
tree = '(((A:0.12,B:0.14):0.08,(C:0.17,D:0.12):0.19):0.13,((E:0.11,F:0.12):0.15,(G:0.12,H:0.17):0.12):0.11);'

In [ ]:
print(ete.Tree(tree))

In [ ]:
with open("data/sym_sample_truth.nwk", 'w') as fh:
    print(tree, file=fh)

Make genome sequences with seq-gen

Using the GTR model of sequence evolution


In [ ]:
seqgen = 'seq-gen -mGTR -s0.1 -l{len} < data/sym_sample_truth.nwk >data/sym_genomes.phy'.format(len=int(genome_size*1e6))

In [ ]:
utils.run_cmd(seqgen)
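
As a quick sanity check (a minimal sketch, assuming seq-gen's default PHYLIP-style output with a "&lt;taxa&gt; &lt;length&gt;" header), confirm the simulated alignment has the expected dimensions.


In [ ]:
# Peek at the PHYLIP header written by seq-gen: "<n_taxa> <alignment_length>"
with open('data/sym_genomes.phy') as fh:
    n_taxa, aln_len = map(int, fh.readline().split())
print(n_taxa, 'taxa,', aln_len, 'bp')
assert n_taxa == num_samples
assert aln_len == int(genome_size * 1e6)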

Read the simulated sample genomes back in from seq-gen's PHYLIP output.


In [ ]:
seqs = []
with open("data/sym_genomes.phy") as fh:
    next(fh)  # skip the PHYLIP header line
    for line in fh:
        name, seq = line.strip().split()
        seqs.append(skbio.Sequence(seq, {'id': name}))
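
A small sanity-check sketch: all eight genomes should have been parsed, and all should be the same length.


In [ ]:
# One genome per sample, all of identical length
print(sorted(s.metadata['id'] for s in seqs))
assert len(seqs) == num_samples
assert len({len(s) for s in seqs}) == 1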

In [ ]:
aln = skbio.Alignment(seqs)

Make NJ tree


In [ ]:
distmat = aln.distances()
distmat
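
For reference, a neighbour-joining tree can also be built from these genome-level distances and eyeballed against the true tree above (a sketch using the nj import from earlier; nothing downstream depends on it).


In [ ]:
# NJ tree over the genome-level distances, for visual comparison with the true tree
genome_tree = nj(distmat)
print(genome_tree.ascii_art())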

Make a repeated version of this distance matrix, with each sample duplicated once per run, so it can be compared directly (visually) with kWIP's output, which is of course computed over the runs rather than the genomes themselves.


In [ ]:
distmat_reps = DistanceMatrix(
    np.repeat(np.repeat(distmat.data, num_runs, axis=1), num_runs, axis=0))
run_names = ['{}-{}'.format(g, i) for g in distmat.ids for i in range(num_runs)]

In [ ]:
distmat_reps.ids = run_names
distmat_reps
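
A quick structural check (sketch only): the expanded matrix should have one row per run, in the same sample-major order as run_names.


In [ ]:
# 8 samples x 3 runs = 24 rows/columns, ordered sample-major
assert distmat_reps.shape == (num_samples * num_runs,) * 2
assert list(distmat_reps.ids) == run_names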

In [ ]:
tree_reps = nj(distmat_reps)
tree_reps.write('data/sym_runs.nwk')

Generate reads


In [ ]:
runs = {}
run_read_counts = {}

for seq in seqs:
    genome = seq.metadata['id']
    print('Genome', genome, end=', reps: ')
    runs[genome] = []
    
    # write genome
    fas = 'data/genomes/sym_{}.fasta'.format(genome)
    seq.write(fas, format='fasta')
    
    # create each run's reads
    for j in range(num_runs):
        print(j, end=' ')
        sys.stdout.flush()
        run = '{}-{}'.format(genome, j)
        fq = "data/fastq/sym_{}_il.fq".format(run)
        n_reads = int(max(random.gauss(mean_n_reads, sd_n_reads), min_n_reads))  # never fewer than min_n_reads
        utils.wgsim(n_reads, fas, fq)
        runs[genome].append(fq)
        run_read_counts[run] = n_reads
    print()
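
utils.wgsim is a project helper whose exact options aren't shown here. As a rough illustration only, a stand-alone equivalent might simulate paired reads with wgsim and interleave them with khmer's interleave-reads.py; every path and parameter below is an assumption, not necessarily what utils.wgsim does.


In [ ]:
# Illustrative only: roughly what a wgsim-based helper might run for one genome.
# Read lengths, temporary paths and the interleaving step are assumptions.
example_cmd = (
    'wgsim -N {n} -1 100 -2 100 data/genomes/sym_A.fasta /tmp/A_R1.fq /tmp/A_R2.fq'
    ' && interleave-reads.py /tmp/A_R1.fq /tmp/A_R2.fq -o data/fastq/sym_A-0_il.fq'
).format(n=int(mean_n_reads))
print(example_cmd)
# utils.run_cmd(example_cmd)  # not run here; the loop above already generated the reads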

Hash each run's reads into a countgraph


In [ ]:
def countgraph(fq, cg, x=1e9, k=20, n=1, quiet=True):
    # Count the k-mers of one run's reads into a compressed countgraph file
    lic = "load-into-countgraph.py -T 12 -N {N} -k {k} -x {x} -s tsv -b {cg} {fq}".format(
            N=n, k=k, x=x, cg=cg, fq=fq)
    print(lic)
    sys.stdout.flush()
    utils.run_cmd(lic, quiet)

In [ ]:
countgraphs = []
for genome in runs:
    for i, fq in enumerate(runs[genome]):
        cg = 'data/countgraphs/sym_{}-{}.cg.gz'.format(genome, i)
        countgraphs.append(cg)
        countgraph(fq, cg, x=countgraph_size, k=20)
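
Before running kWIP, a quick check (sketch) that every countgraph file was actually written.


In [ ]:
import os
# All 24 countgraphs should exist on disk
missing = [cg for cg in countgraphs if not os.path.exists(cg)]
print(len(countgraphs), 'countgraphs,', len(missing), 'missing')
assert not missing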

Run kWIP


In [ ]:
def kwip(countgraphs, dist, kern='', weighted=True, quiet=True):
    # Run kWIP over the countgraphs, writing a distance matrix (and optionally a kernel matrix)
    if kern:
        kern = '-k {kern}'.format(kern=kern)
    unweight = ''
    if not weighted:
        unweight = '-U'
    cgs = ' '.join(countgraphs)
    cmd = "kwip {kern} {wht} -d {dist} {cgs}".format(wht=unweight, kern=kern, dist=dist, cgs=cgs)
    print(cmd)
    utils.run_cmd(cmd, quiet)

In [ ]:
kwip(sorted(countgraphs), 'data/sym-kwip.dist', 'data/sym-kwip.kern')

In [ ]:
kwip(sorted(countgraphs), 'data/sym-ip.dist', 'data/sym-ip.kern', weighted=False)

Analyse the output


In [ ]:
from skbio import DistanceMatrix
from skbio.tree import nj

Make a tree from kWIP's output


In [ ]:
kwip_dist = DistanceMatrix.read("data/sym-kwip.dist")
ip_dist = DistanceMatrix.read("data/sym-ip.dist")
kwip_dist.ids = run_names
ip_dist.ids = run_names

In [ ]:
kwip_dist.plot(title="kWIP Distances")
distmat_reps.ids = run_names
distmat_reps.plot(title='True Distances')
ip_dist.plot(title='Unweighted kWIP Distance')
print()  # keep the cell's output clean

In [ ]:
kwip_tree = nj(kwip_dist)
kwip_tree.write('data/sym_kwip.nwk')

In [ ]:
ip_tree = nj(ip_dist)
ip_tree.write('data/sym_ip.nwk')

Robinson-Foulds distance

A measure of tree concordance; smaller is better.


In [ ]:
true_tree = ete.Tree("data/sym_runs.nwk")
print(true_tree)
kwip_tree = ete.Tree("data/sym_kwip.nwk")
print(kwip_tree)
ip_tree = ete.Tree("data/sym_ip.nwk")
print(ip_tree)

And the RF distances are....


In [ ]:
print('kWIP:', kwip_tree.robinson_foulds(true_tree, unrooted_trees=True)[0])
print('Unweighted:', ip_tree.robinson_foulds(true_tree, unrooted_trees=True)[0])
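
The raw RF value depends on tree size, so it is often normalised by the maximum possible RF, which ete3 returns as the second element of the result tuple (a small sketch).


In [ ]:
# Normalised RF: 0 = identical topology, 1 = no shared bipartitions
for name, tree in [('kWIP', kwip_tree), ('Unweighted', ip_tree)]:
    rf, max_rf = tree.robinson_foulds(true_tree, unrooted_trees=True)[:2]
    print(name, rf / max_rf if max_rf else 0.0)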

Hierarchical clustering

And plotting with matplotlib


In [ ]:
from scipy.cluster import hierarchy as hier
import matplotlib.pyplot as plt

In [ ]:
%matplotlib inline

In [ ]:
z = hier.linkage(kwip_dist.condensed_form(), method='complete')

In [ ]:
plt.figure(figsize=(10, 10))
x = hier.dendrogram(z, labels=run_names)
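
The same complete-linkage clustering can be applied to the unweighted distances for a side-by-side comparison (a sketch mirroring the cells above).


In [ ]:
# Complete-linkage clustering of the unweighted (inner-product) distances
z_ip = hier.linkage(ip_dist.condensed_form(), method='complete')
plt.figure(figsize=(10, 10))
_ = hier.dendrogram(z_ip, labels=run_names)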

In [ ]: