Symmetric Tree Dataset

This notebook creates a dataset from a symmetric bifurcating tree of 8 samples, with three sequencing runs per sample.


In [ ]:
import utils
import gzip
import random
import string
import math
import ete3 as ete
from skbio import Alignment, DNA, DistanceMatrix
from skbio.tree import nj
import numpy as np
import skbio
import sys

In [ ]:
seed = 1003
genome_size = 1  # Mbp
num_samples = 8
num_runs = 3

mean_n_reads = 5e5
sd_n_reads = mean_n_reads * 0.1 # Coeff of Var = 0.1
min_n_reads = mean_n_reads / 100.0

countgraph_size = 1e8

Don't edit below here

All constants are set in the cell above this point


In [ ]:
! rm -rf data
! mkdir data
for subdir in ['genomes', 'fastq', 'countgraphs']:
    ! mkdir data/{subdir}

Seed the random number generators (both here and in the utils module)


In [ ]:
random.seed(seed)
utils.random.seed(seed)

Generate sample genomes

First, we make a tree with the following structure


In [ ]:
tree = '(((A:0.12,B:0.14):0.08,(C:0.17,D:0.12):0.19):0.13,((E:0.11,F:0.12):0.15,(G:0.12,H:0.17):0.12):0.11);'

In [ ]:
print(ete.Tree(tree))

In [ ]:
with open("data/sym_sample_truth.nwk", 'w') as fh:
    print(tree, file=fh)

Make genome sequences with seq-gen

Using the GTR model of sequence evolution


In [ ]:
seqgen = 'seq-gen -mGTR -s0.1 -l{len} < data/sym_sample_truth.nwk >data/sym_genomes.phy'.format(len=int(genome_size*1e6))

In [ ]:
utils.run_cmd(seqgen)
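
As a quick sanity check (a minimal sketch, assuming seq-gen's default PHYLIP-style output with a "&lt;taxa&gt; &lt;length&gt;" header), confirm the simulated alignment has the expected dimensions.


In [ ]:
# Peek at the PHYLIP header written by seq-gen: "<n_taxa> <alignment_length>"
with open('data/sym_genomes.phy') as fh:
    n_taxa, aln_len = map(int, fh.readline().split())
print(n_taxa, 'taxa,', aln_len, 'bp')
assert n_taxa == num_samples
assert aln_len == int(genome_size * 1e6)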

Read the simulated sample genomes back in from seq-gen's PHYLIP output.


In [ ]:
seqs = []
with open("data/sym_genomes.phy") as fh:
    next(fh)  # skip the PHYLIP header line
    for line in fh:
        name, seq = line.strip().split()
        seqs.append(skbio.Sequence(seq, {'id': name}))
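
A small sanity-check sketch: all eight genomes should have been parsed, and all should be the same length.


In [ ]:
# One genome per sample, all of identical length
print(sorted(s.metadata['id'] for s in seqs))
assert len(seqs) == num_samples
assert len({len(s) for s in seqs}) == 1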

In [ ]:
aln = skbio.Alignment(seqs)

Make NJ tree


In [ ]:
distmat = aln.distances()
distmat
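
For reference, a neighbour-joining tree can also be built from these genome-level distances and eyeballed against the true tree above (a sketch using the nj import from earlier; nothing downstream depends on it).


In [ ]:
# NJ tree over the genome-level distances, for visual comparison with the true tree
genome_tree = nj(distmat)
print(genome_tree.ascii_art())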

Make a repeated version of this distance matrix, with each sample duplicated once per run, so it can be compared directly (visually) with kWIP's output, which is of course computed over the runs rather than the genomes themselves.


In [ ]:
distmat_reps = DistanceMatrix(
    np.repeat(np.repeat(distmat.data, num_runs, axis=1), num_runs, axis=0))
run_names = ['{}-{}'.format(g, i) for g in distmat.ids for i in range(num_runs)]

In [ ]:
distmat_reps.ids = run_names
distmat_reps
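
A quick structural check (sketch only): the expanded matrix should have one row per run, in the same sample-major order as run_names.


In [ ]:
# 8 samples x 3 runs = 24 rows/columns, ordered sample-major
assert distmat_reps.shape == (num_samples * num_runs,) * 2
assert list(distmat_reps.ids) == run_names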

In [ ]:
tree_reps = nj(distmat_reps)
tree_reps.write('data/sym_runs.nwk')

Generate reads


In [ ]:
runs = {}
run_read_counts = {}

for seq in seqs:
    genome = seq.metadata['id']
    print('Genome', genome, end=', reps: ')
    runs[genome] = []
    
    # write genome
    fas = 'data/genomes/sym_{}.fasta'.format(genome)
    seq.write(fas, format='fasta')
    
    # create each run's reads
    for j in range(num_runs):
        print(j, end=' ')
        sys.stdout.flush()
        run = '{}-{}'.format(genome, j)
        fq = "data/fastq/sym_{}_il.fq".format(run)
        n_reads = int(max(random.gauss(mean_n_reads, sd_n_reads), min_n_reads))  # never fewer than min_n_reads
        utils.wgsim(n_reads, fas, fq)
        runs[genome].append(fq)
        run_read_counts[run] = n_reads
    print()
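
utils.wgsim is a project helper whose exact options aren't shown here. As a rough illustration only, a stand-alone equivalent might simulate paired reads with wgsim and interleave them with khmer's interleave-reads.py; every path and parameter below is an assumption, not necessarily what utils.wgsim does.


In [ ]:
# Illustrative only: roughly what a wgsim-based helper might run for one genome.
# Read lengths, temporary paths and the interleaving step are assumptions.
example_cmd = (
    'wgsim -N {n} -1 100 -2 100 data/genomes/sym_A.fasta /tmp/A_R1.fq /tmp/A_R2.fq'
    ' && interleave-reads.py /tmp/A_R1.fq /tmp/A_R2.fq -o data/fastq/sym_A-0_il.fq'
).format(n=int(mean_n_reads))
print(example_cmd)
# utils.run_cmd(example_cmd)  # not run here; the loop above already generated the reads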

Hash each run's reads into a countgraph


In [ ]:
def countgraph(fq, cg, x=1e9, k=20, n=1, quiet=True):
    # Count the k-mers of one run's reads into a compressed countgraph file
    lic = "load-into-countgraph.py -T 12 -N {N} -k {k} -x {x} -s tsv -b {cg} {fq}".format(
            N=n, k=k, x=x, cg=cg, fq=fq)
    print(lic)
    sys.stdout.flush()
    utils.run_cmd(lic, quiet)

In [ ]:
countgraphs = []
for genome in runs:
    for i, fq in enumerate(runs[genome]):
        cg = 'data/countgraphs/sym_{}-{}.cg.gz'.format(genome, i)
        countgraphs.append(cg)
        countgraph(fq, cg, x=countgraph_size, k=20)
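
Before running kWIP, a quick check (sketch) that every countgraph file was actually written.


In [ ]:
import os
# All 24 countgraphs should exist on disk
missing = [cg for cg in countgraphs if not os.path.exists(cg)]
print(len(countgraphs), 'countgraphs,', len(missing), 'missing')
assert not missing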

Run kWIP


In [ ]:
def kwip(countgraphs, dist, kern='', weighted=True, quiet=True):
    # Run kWIP over the countgraphs, writing a distance matrix (and optionally a kernel matrix)
    if kern:
        kern = '-k {kern}'.format(kern=kern)
    unweight = ''
    if not weighted:
        unweight = '-U'
    cgs = ' '.join(countgraphs)
    cmd = "kwip {kern} {wht} -d {dist} {cgs}".format(wht=unweight, kern=kern, dist=dist, cgs=cgs)
    print(cmd)
    utils.run_cmd(cmd, quiet)

In [ ]:
kwip(sorted(countgraphs), 'data/sym-kwip.dist', 'data/sym-kwip.kern')

In [ ]:
kwip(sorted(countgraphs), 'data/sym-ip.dist', 'data/sym-ip.kern', weighted=False)

Analyse the output


In [ ]:
from skbio import DistanceMatrix
from skbio.tree import nj

Make a tree from kWIP's output


In [ ]:
kwip_dist = DistanceMatrix.read("data/sym-kwip.dist")
ip_dist = DistanceMatrix.read("data/sym-ip.dist")
kwip_dist.ids = run_names
ip_dist.ids = run_names

In [ ]:
kwip_dist.plot(title="kWIP Distances")
distmat_reps.ids = run_names
distmat_reps.plot(title='True Distances')
ip_dist.plot(title='Unweighted kWIP Distance')
print()  # keep the cell's output clean

In [ ]:
kwip_tree = nj(kwip_dist)
kwip_tree.write('data/sym_kwip.nwk')

In [ ]:
ip_tree = nj(ip_dist)
ip_tree.write('data/sym_ip.nwk')

Robinson-Foulds distance

A measure of tree concordance; smaller is better.


In [ ]:
true_tree = ete.Tree("data/sym_runs.nwk")
print(true_tree)
kwip_tree = ete.Tree("data/sym_kwip.nwk")
print(kwip_tree)
ip_tree = ete.Tree("data/sym_ip.nwk")
print(ip_tree)

And the RF distances are....


In [ ]:
print('kWIP:', kwip_tree.robinson_foulds(true_tree, unrooted_trees=True)[0])
print('Unweighted:', ip_tree.robinson_foulds(true_tree, unrooted_trees=True)[0])
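
The raw RF value depends on tree size, so it is often normalised by the maximum possible RF, which ete3 returns as the second element of the result tuple (a small sketch).


In [ ]:
# Normalised RF: 0 = identical topology, 1 = no shared bipartitions
for name, tree in [('kWIP', kwip_tree), ('Unweighted', ip_tree)]:
    rf, max_rf = tree.robinson_foulds(true_tree, unrooted_trees=True)[:2]
    print(name, rf / max_rf if max_rf else 0.0)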

Hierarchical clustering

And plotting with matplotlib


In [ ]:
from scipy.cluster import hierarchy as hier
import matplotlib.pyplot as plt

In [ ]:
%matplotlib inline

In [ ]:
z = hier.linkage(kwip_dist.condensed_form(), method='complete')

In [ ]:
plt.figure(figsize=(10, 10))
x = hier.dendrogram(z, labels=run_names)
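
The same complete-linkage clustering can be applied to the unweighted distances for a side-by-side comparison (a sketch mirroring the cells above).


In [ ]:
# Complete-linkage clustering of the unweighted (inner-product) distances
z_ip = hier.linkage(ip_dist.condensed_form(), method='complete')
plt.figure(figsize=(10, 10))
_ = hier.dendrogram(z_ip, labels=run_names)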

In [ ]: