In [ ]:
import utils
import gzip
import random
import string
import math
import ete3 as ete
import matplotlib.pyplot as plt
In [ ]:
%matplotlib inline
In [ ]:
seed = 1003
genome_size = 1 # mbp
num_samples = 8
num_runs = 3
mean_n_reads = 5e5
sd_n_reads = mean_n_reads * 0.1 # Coeff of Var = 0.1
min_n_reads = mean_n_reads / 100.0
countgraph_size = 1e8
In [ ]:
! rm -rf data
! mkdir data
for subdir in ['genomes', 'fastq', 'countgraphs']:
! mkdir data/{subdir}
Set a random seed and seed the RNG
In [ ]:
random.seed(seed)
utils.random.seed(seed)
In [ ]:
genome = utils.make_rand_genome(mbp=genome_size)
In [ ]:
levels = int(math.ceil(math.log2(num_samples)))
In [ ]:
bifork = utils.biforcating_sequences(genome, levels=levels, av_rate=0.0001, sd_rate=0.00001)
seqlist = list(utils.flatten(bifork))
In [ ]:
with open("data/bifork.fas", 'w') as fh:
utils.print_multifasta(seqlist, file=fh)
In [ ]:
from skbio import Alignment, DNA, DistanceMatrix
from skbio.tree import nj
import numpy as np
In [ ]:
aln = Alignment.read('data/bifork.fas')
In [ ]:
distmat = aln.distances()
Make a repeated version of this distance matrix, which can be directly (visually) compared with the result of kWIP, which will of course be of the runs, not the genomes themselves.
In [ ]:
distmat_reps = DistanceMatrix(
np.repeat(np.repeat(distmat.data,num_runs, axis=1), num_runs, axis=0))
In [ ]:
distmat_reps
In [ ]:
tree_reps = nj(distmat_reps)
tree_reps.write('data/bifork_runs.nwk')
In [ ]:
genomes = {}
runs = []
r2g = {}
for i, seq in enumerate(seqlist):
genome = string.ascii_uppercase[i]
print('Genome', genome, end=', reps: ')
genomes[genome] = []
# write genome
fas = 'data/genomes/bifork_{}.fasta'.format(genome)
with open(fas, 'wb') as fh:
fh.write(">{}\n{}\n".format(genome, seq).encode('ascii'))
# create each run
for j in range(num_runs):
print(j, end=' ')
fq = "data/fastq/bifork_{}-{}_il.fq".format(genome, j)
n_reads = max(int(random.gauss(mean_n_reads, sd_n_reads)), min_n_reads)
utils.wgsim(n_reads, fas, fq)
genomes[genome].append(fq)
runs.append(fq)
r2g[fq] = genome
print()
In [ ]:
def countgraph(fq, cg, x=1e9, k=20, n=1, quiet=True):
lic = "load-into-countgraph.py -T 8 -N {N} -k {k} -x {x} -s tsv -b {cg} {fq}".format(
N=n, k=k, x=x, cg=cg, fq=fq)
print(lic)
utils.run_cmd(lic, quiet)
In [ ]:
countgraphs = []
for genome in genomes:
for i, fq in enumerate(genomes[genome]):
cg = 'data/countgraphs/bifork_{}-{}.cg'.format(genome, i)
countgraphs.append(cg)
countgraph(fq, cg, x=countgraph_size, k=20)
In [ ]:
def kwip(countgraphs, dist, kern='', weighted=True, quiet=True):
if kern:
kern = '-k {kern}'.format(kern=kern)
unweight = ''
if not weighted:
unweight = '-U'
cgs = ' '.join(countgraphs)
cmd = "kwip {kern} {wht} -d {dist} {cgs}".format(wht=unweight, kern=kern, dist=dist, cgs=cgs)
print(cmd)
utils.run_cmd(cmd, quiet)
In [ ]:
kwip(sorted(countgraphs), 'data/bifork-kwip.dist', 'data/bifork-kwip.kern')
In [ ]:
kwip(sorted(countgraphs), 'data/bifork-ip.dist', 'data/bifork-ip.kern', weighted=False)
In [ ]:
from skbio import DistanceMatrix
from skbio.tree import nj
Make a tree from kWIP's output
In [ ]:
kwip_dist = DistanceMatrix.read("data/bifork-kwip.dist")
kwip_dist.ids = [x.split('_')[1] for x in kwip_dist.ids]
distmat_reps.ids = kwip_dist.ids
ip_dist = DistanceMatrix.read("data/bifork-ip.dist")
ip_dist.ids = kwip_dist.ids
In [ ]:
kwip_dist.plot(title='kWIP Distances')
plt.xlim(0, 24)
plt.ylim(0, 24)
plt.savefig('data/kwip-mat.png', dpi=320)
distmat_reps.plot(title='True Distances')
plt.xlim(0, 24)
plt.ylim(0, 24)
plt.savefig('data/true-mat.png', dpi=320)
print()
In [ ]:
ip_dist.plot(title='Unweighted IP Distances')
plt.xlim(0, 24)
plt.ylim(0, 24)
plt.savefig('data/ip-mat.png', dpi=320)
In [ ]:
kwip_tree = nj(kwip_dist)
kwip_tree.write('data/bifork_kwip.nwk')
In [ ]:
true_tree = ete.Tree("data/bifork_runs.nwk")
kwip_tree = ete.Tree("data/bifork_kwip.nwk")
And the RF distance is....
In [ ]:
kwip_tree.robinson_foulds(true_tree, unrooted_trees=True)[0]
In [ ]:
from scipy.cluster import hierarchy as hier
In [ ]:
plt.figure(figsize=(10,4))
z = hier.linkage(kwip_dist.condensed_form(), method='complete')
x = hier.dendrogram(z, labels=kwip_dist.ids)
plt.title("kWIP Dendrogram")
plt.savefig('data/kwip-dendro.png')
In [ ]:
plt.figure(figsize=(10,4))
z = hier.linkage(distmat_reps.condensed_form(), method='complete')
x = hier.dendrogram(z, labels=distmat_reps.ids)
plt.title("True Dendrogram")
plt.savefig('data/true-dendro.png')