In [ ]:
import utils
import gzip
import random
import string
import math
import ete3 as ete
from skbio import Alignment, DNA, DistanceMatrix
from skbio.tree import nj
import numpy as np
import skbio
import sys
In [ ]:
seed = 1003
genome_size = 1 # mbp
num_samples = 8
num_runs = 3
mean_n_reads = 5e5
sd_n_reads = mean_n_reads * 0.1 # Coeff of Var = 0.1
min_n_reads = mean_n_reads / 100.0
countgraph_size = 1e8
In [ ]:
! rm -rf data
! mkdir data
for subdir in ['genomes', 'fastq', 'countgraphs']:
! mkdir data/{subdir}
Set a random seed and seed the RNG
In [ ]:
random.seed(seed)
utils.random.seed(seed)
In [ ]:
tree = '(((A:0.12,B:0.14):0.08,(C:0.17,D:0.12):0.19):0.13,((E:0.11,F:0.12):0.15,(G:0.12,H:0.17):0.12):0.11);'
In [ ]:
print(ete.Tree(tree))
In [ ]:
with open("data/sym_sample_truth.nwk", 'w') as fh:
print(tree, file=fh)
In [ ]:
seqgen = 'seq-gen -mGTR -s0.1 -l{len} < data/sym_sample_truth.nwk >data/sym_genomes.phy'.format(len=int(genome_size*1e6))
In [ ]:
utils.run_cmd(seqgen)
Make a random genome, and samples derived from it. Write it to a fasta file.
In [ ]:
seqs = []
with open("data/sym_genomes.phy") as fh:
next(fh) # nuke first line
for line in fh:
name, seq = line.strip().split()
seqs.append(skbio.Sequence(seq, {'id': name}))
In [ ]:
aln = skbio.Alignment(seqs)
In [ ]:
distmat = aln.distances()
distmat
Make a repeated version of this distance matrix, which can be directly (visually) compared with the result of kWIP, which will of course be of the runs, not the genomes themselves.
In [ ]:
distmat_reps = DistanceMatrix(
np.repeat(np.repeat(distmat.data,num_runs, axis=1), num_runs, axis=0))
run_names = ['{}-{}'.format(g, i) for g in distmat.ids for i in range(num_runs)]
In [ ]:
distmat_reps.ids = run_names
distmat_reps
In [ ]:
tree_reps = nj(distmat_reps)
tree_reps.write('data/sym_runs.nwk')
In [ ]:
runs = {}
run_read_counts = {}
for seq in seqs:
genome = seq.metadata['id']
print('Genome', genome, end=', reps: ')
runs[genome] = []
# write genome
fas = 'data/genomes/sym_{}.fasta'.format(genome)
seq.write(fas, format='fasta')
# create each run's reads
for j in range(num_runs):
print(j, end=' ')
sys.stdout.flush()
run = '{}-{}'.format(genome, j)
fq = "data/fastq/sym_{}_il.fq".format(run)
n_reads = max(int(random.gauss(mean_n_reads, sd_n_reads)), min_n_reads)
utils.wgsim(n_reads, fas, fq)
runs[genome].append(fq)
run_read_counts[run] = n_reads
print()
In [ ]:
def countgraph(fq, cg, x=1e9, k=20, n=1, quiet=True):
lic = "load-into-countgraph.py -T 12 -N {N} -k {k} -x {x} -s tsv -b {cg} {fq}".format(
N=n, k=k, x=x, cg=cg, fq=fq)
print(lic)
sys.stdout.flush()
utils.run_cmd(lic, quiet)
In [ ]:
countgraphs = []
for genome in runs:
for i, fq in enumerate(runs[genome]):
cg = 'data/countgraphs/bifork_{}-{}.cg.gz'.format(genome, i)
countgraphs.append(cg)
countgraph(fq, cg, x=countgraph_size, k=20)
In [ ]:
def kwip(countgraphs, dist, kern='', weighted=True, quiet=True):
if kern:
kern = '-k {kern}'.format(kern=kern)
unweight = ''
if not weighted:
unweight = '-U'
cgs = ' '.join(countgraphs)
cmd = "kwip {kern} {wht} -d {dist} {cgs}".format(wht=unweight, kern=kern, dist=dist, cgs=cgs)
print(cmd)
utils.run_cmd(cmd, quiet)
In [ ]:
kwip(sorted(countgraphs), 'data/asym-kwip.dist', 'data/asym-kwip.kern')
In [ ]:
kwip(sorted(countgraphs), 'data/asym-ip.dist', 'data/asym-ip.kern', weighted=False)
In [ ]:
from skbio import DistanceMatrix
from skbio.tree import nj
Make a tree from kWIP's output
In [ ]:
kwip_dist = DistanceMatrix.read("data/asym-kwip.dist")
ip_dist = DistanceMatrix.read("data/asym-ip.dist")
kwip_dist.ids = run_names
ip_dist.ids = run_names
In [ ]:
kwip_dist.plot(title="kWIP Distances")
distmat_reps.ids = run_names
distmat_reps.plot(title='True Distances')
ip_dist.plot(title='Unweighted kWIP Distance')
print() #
In [ ]:
kwip_tree = nj(kwip_dist)
kwip_tree.write('data/asym_kwip.nwk')
In [ ]:
ip_tree = nj(ip_dist)
ip_tree.write('data/asym_ip.nwk')
In [ ]:
true_tree = ete.Tree("data/asym_runs.nwk")
print(true_tree)
kwip_tree = ete.Tree("data/asym_kwip.nwk")
print(kwip_tree)
ip_tree = ete.Tree("data/asym_ip.nwk")
print(ip_tree)
And the RF distance is....
In [ ]:
print('kWIP:', kwip_tree.robinson_foulds(true_tree, unrooted_trees=True)[0])
print('Unweighted:', ip_tree.robinson_foulds(true_tree, unrooted_trees=True)[0])
In [ ]:
from scipy.cluster import hierarchy as hier
import matplotlib.pyplot as plt
In [ ]:
%matplotlib inline
In [ ]:
z = hier.linkage(kwip_dist.condensed_form(), method='complete')
In [ ]:
plt.figure(figsize=(10, 10))
x = hier.dendrogram(z, labels=run_names)
In [ ]: