In [1]:
## Imports and working/output directories directories
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
plt.rcParams["figure.figsize"] = [12,9]
from collections import Counter
import seaborn as sns
sns.set_style('white')
sns.set_style('ticks')
import toyplot
import toyplot.html ## toypot sublib for saving html plots
import pandas as pd
import numpy as np
import subprocess
import collections
import allel
import vcfnp
import shutil
import gzip
import glob
import os
from allel.util import * ## for ensure_square()
WORK_DIR="/home/iovercast/manuscript-analysis/"
## Simulation dirs
REFMAP_SIM_DIR = os.path.join(WORK_DIR, "REFMAP_SIM/")
IPYRAD_SIM_DIR = os.path.join(REFMAP_SIM_DIR, "ipyrad/reference-assembly/")
STACKS_SIM_DIR = os.path.join(REFMAP_SIM_DIR, "stacks/")
DDOCENT_SIM_DIR = os.path.join(REFMAP_SIM_DIR, "ddocent/")
## A list of the simluated assembler names for indexing dicts
assemblers = ["ipyrad-reference", "ipyrad-denovo_reference", "stacks", "ddocent"]
## Empirical dirs
REFMAP_EMPIRICAL_DIR=os.path.join(WORK_DIR, "Phocoena_empirical/")
IPYRAD_REFMAP_DIR=os.path.join(REFMAP_EMPIRICAL_DIR, "ipyrad/")
STACKS_REFMAP_DIR=os.path.join(REFMAP_EMPIRICAL_DIR, "stacks/")
DDOCENT_REFMAP_DIR=os.path.join(REFMAP_EMPIRICAL_DIR, "ddocent/")
In [334]:
## LD pruning code from the scikit-allele cookbook
def plot_ld(gn, title):
m = allel.stats.rogers_huff_r(gn) ** 2
ax = allel.plot.pairwise_ld(m)
ax.set_title(title)
def ld_prune(gn, size=1000, step=1000, threshold=.3, n_iter=5):
for i in range(n_iter):
loc_unlinked = allel.stats.ld.locate_unlinked(gn, size=size, step=step, threshold=threshold)
n = np.count_nonzero(loc_unlinked)
n_remove = gn.shape[0] - n
print('iteration', i+1, 'retaining', n, 'removing', n_remove, 'variants')
gn = gn.compress(loc_unlinked, axis=0)
return gn
def plotPCA(call_data, title):
c = call_data
g = allel.GenotypeArray(c.genotype)
ac = g.count_alleles()
## Filter singletons and multi-allelic snps
flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
gf = g.compress(flt, axis=0)
gn = gf.to_n_alt()
coords1, model1 = allel.stats.pca(gn, n_components=10, scaler='patterson')
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1)
sns.despine(ax=ax, offset=5)
x = coords1[:, 0]
y = coords1[:, 1]
## We know this works because the species_dict and the columns in the vcf
## are in the same order.
for sp in species:
flt = (np.array(species_dict.values()) == sp)
ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=species_colors[sp], label=sp, markersize=10, mec='k', mew=.5)
ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
fig.suptitle(title+" pca", y=1.02, style="italic", fontsize=20, fontweight='bold')
fig.tight_layout()
def getPCA(call_data):
c = call_data
g = allel.GenotypeArray(c.genotype)
ac = g.count_alleles()
## Filter singletons and multi-allelic snps
flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
gf = g.compress(flt, axis=0)
gn = gf.to_n_alt()
## Test w/ ld pruning. Doesn't appreciably change the results.
#gnu = ld_prune(gn)
#gn = gnu
coords1, model1 = allel.stats.pca(gn, n_components=10, scaler='patterson')
return coords1, model1
## We don't actually use this function bcz the allele.plot.pairwise_distance()
## call returns a raster which looks nasty in print.
def plotPairwiseDistance(call_data, title):
c = call_data
#c = vcfnp.calldata_2d(filename).view(np.recarray)
g = allel.GenotypeArray(c.genotype)
gn = g.to_n_alt()
dist = allel.stats.pairwise_distance(gn, metric='euclidean')
allel.plot.pairwise_distance(dist, labels=species_dict.keys())
def getDistances(call_data):
c = call_data
#c = vcfnp.calldata_2d(filename).view(np.recarray)
g = allel.GenotypeArray(c.genotype)
gn = g.to_n_alt()
dist = allel.stats.pairwise_distance(gn, metric='euclidean')
return(dist)
In [3]:
## Inputs to this function are two Counters where keys
## are the base position and values are snp counts.
## The counter doesn't have to be sorted because we sort internally.
def SNP_position_plot(prog, distvar, distpis):
## The last position to consider
maxend = np.array(distvar.keys()).max()
## This does two things, first it sorts in increasing
## order. Second, it creates a count bin for any position
## without snps and sets the count to 0.
distvar = [distvar[x] for x in xrange(maxend)]
distpis = [distpis[x] for x in xrange(maxend)]
## set color theme
colormap = toyplot.color.Palette()
## make a canvas
canvas = toyplot.Canvas(width=800, height=300)
## make axes
axes = canvas.cartesian(xlabel="Position along RAD loci",
ylabel="N variables sites",
gutter=65)
axes.label.text = prog
## x-axis
axes.x.ticks.show = True
axes.x.label.style = {"baseline-shift":"-40px", "font-size":"16px"}
axes.x.ticks.labels.style = {"baseline-shift":"-2.5px", "font-size":"12px"}
axes.x.ticks.below = 5
axes.x.ticks.above = 0
axes.x.domain.max = maxend
axes.x.ticks.locator = toyplot.locator.Explicit(
range(0, maxend, 5),
map(str, range(0, maxend, 5)))
## y-axis
axes.y.ticks.show=True
axes.y.label.style = {"baseline-shift":"40px", "font-size":"16px"}
axes.y.ticks.labels.style = {"baseline-shift":"5px", "font-size":"12px"}
axes.y.ticks.below = 0
axes.y.ticks.above = 5
## add fill plots
x = np.arange(0, maxend)
f1 = axes.fill(x, distvar, color=colormap[0], opacity=0.5, title="total variable sites")
f2 = axes.fill(x, distpis, color=colormap[1], opacity=0.5, title="parsimony informative sites")
## add a horizontal dashed line at the median Nsnps per site
axes.hlines(np.median(distvar), opacity=0.9, style={"stroke-dasharray":"4, 4"})
axes.hlines(np.median(distpis), opacity=0.9, style={"stroke-dasharray":"4, 4"})
return canvas, axes
In [4]:
import numpy_indexed as npi
## Get the number of samples with data at each snp
def snp_coverage(call_data):
snp_counts = collections.Counter([x.sum() for x in call_data["GT"] != "./."])
## Fill zero values
return [snp_counts[x] for x in xrange(1, np.array(snp_counts.keys()).max()+1)]
## Get the number of samples with data at each locus
def loci_coverage(var_data, call_data, assembler):
if "stacks" in assembler:
loci = zip(*npi.group_by(map(lambda x: x.split("_")[0],var_data["ID"]))(call_data["GT"] != "./."))
else:
loci = zip(*npi.group_by(var_data["CHROM"])(call_data["GT"] != "./."))
counts_per_snp = []
for z in xrange(0, len(loci)):
counts_per_snp.append([x.sum() for x in loci[z][1]])
counts = collections.Counter([np.max(x) for x in counts_per_snp])
## Fill all zero values
return [counts[x] for x in xrange(1, np.array(counts.keys()).max()+1)]
## Get total number of snps per sample
def sample_nsnps(call_data):
return [x.sum() for x in call_data["GT"].T != "./."]
## Get total number of loci per sample
def sample_nloci(var_data, call_data, assembler):
if "stacks" in assembler:
locus_groups = npi.group_by(map(lambda x: x.split("_")[0],var_data["ID"]))(call_data["GT"] != "./.")
else:
locus_groups = npi.group_by(v["CHROM"])(c["GT"] != "./.")
by_locus = [x.T for x in locus_groups[1]]
by_sample = np.array([(x).any(axis=1) for x in by_locus])
return [x.sum() for x in by_sample.T]
In [29]:
## Make a new pandas dataframe for holding the coverage results
sim_vcf_dict = {}
sim_vcf_dict["ipyrad-reference"] = os.path.join(IPYRAD_SIM_DIR, "refmap-sim_outfiles/refmap-sim.vcf")
sim_vcf_dict["ipyrad-denovo_plus_reference"] = os.path.join(IPYRAD_SIM_DIR, "denovo_plus_reference-sim_outfiles/denovo_plus_reference-sim.vcf")
sim_vcf_dict["ipyrad-denovo_plus_reference"] = os.path.join(IPYRAD_SIM_DIR, "denovo_minus_reference-sim_outfiles/denovo_minus_reference-sim.vcf")
sim_vcf_dict["stacks"] = os.path.join(STACKS_SIM_DIR, "batch_1.vcf")
sim_vcf_dict["ddocent"] = os.path.join(DDOCENT_SIM_DIR, "TotalRawSNPs.snps.vcf.recode.vcf")
## Make sure we have all the vcf files
for k, f in sim_vcf_dict.items():
if os.path.exists(f):
print("found - {}".format(f))
else:
print("not found - {}".format(f))
Get number of loci per sample, and locus coverage for each vcf file from each assembler and each simulated dataset. This is reading in from the vcf files from each assembly method. This is nice because vcf is relatively standard and all the tools can give us a version of vcf. It's not perfect though because it doesn't include information about monomorphic sites, so it doesn't tell us the true number of loci recovered. We can get an idea of coverage and depth at snps, but to get coverage and depth stats across all loci we need to dig into the guts of the output of each method (which we'll do later).
In [10]:
import collections
sim_loc_cov = collections.OrderedDict()
sim_snp_cov = collections.OrderedDict()
sim_sample_nsnps = collections.OrderedDict()
sim_sample_nlocs = collections.OrderedDict()
## Try just doing them all the same
for prog, filename in sim_vcf_dict.items():
try:
print("Doing - {}".format(prog))
print(" {}".format(filename))
v = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray)
c = vcfnp.calldata_2d(filename, verbose=False).view(np.recarray)
sim_snp_cov[prog] = snp_coverage(c)
sim_sample_nsnps[prog] = sample_nsnps(c)
sim_loc_cov[prog] = loci_coverage(v, c, prog)
sim_sample_nlocs[prog] = sample_nloci(v, c, prog)
except Exception as inst:
print(inst)
This isn't very helpful, but you get an idea of what's going on. The ddocent results for locus coverage and number of loci per sample are ugly because for reference sequence mapping the ddocent output vcf doesn't record this information. It only retains CHROM and POS, CHROM being "MT" here and POS being the base position of each snp.
In [11]:
for statname, stat in {"sim_loc_cov":sim_loc_cov, "sim_snp_cov":sim_snp_cov,\
"sim_sample_nsnps":sim_sample_nsnps, "sim_sample_nlocs":sim_sample_nlocs}.items():
for prog in sim_vcf_dict.keys():
try:
print(prog + " " + statname + "\t"),
print(stat[prog ]),
print(np.mean(stat[prog]))
except:
print("No {} stats for {}".format(statname, prog))
print("------------------------------------------------------")
In [275]:
## Load the calldata into a dict so we don't have to keep loading and reloading it
calldata = {}
for prog in sim_vcf_dict.keys():
print("{}".format(prog)),
print("{}".format(sim_vcf_dict[prog]))
c = vcfnp.calldata_2d(sim_vcf_dict[prog], verbose=False).view(np.recarray)
calldata[prog] = c
In [276]:
pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"]
pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"]
pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"]
sim_sample_names = pop1 + pop2 + pop3
pops = {"pop1":pop1, "pop2":pop2, "pop3":pop3}
pop_colors = {"pop1":"r", "pop2":"b", "pop3":"g"}
flt = np.in1d(np.array(sim_sample_names), pop1)
print(flt)
In [278]:
f, axarr = plt.subplots(2,2, figsize=(10,10), dpi=1000)
axarr = [a for b in axarr for a in b]
for prog, ax in zip(assemblers, axarr):
coords1, model1 = getPCA(calldata[prog])
x = coords1[:, 0]
y = coords1[:, 1]
ax.scatter(x, y, marker='o')
ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
for pop in pops.keys():
flt = np.in1d(np.array(sim_sample_names), pops[pop])
ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=pop_colors[pop], label=pop, markersize=10, mec='k', mew=.5)
ax.set_title(prog, style="italic")
ax.axison = True
f.tight_layout()
In [279]:
f, axarr = plt.subplots(2, 2, figsize=(10, 10), dpi=1000)
axarr = [a for b in axarr for a in b]
for prog, ax in zip(assemblers, axarr):
## Calculate pairwise distances
dist = getDistances(calldata[prog])
## Doing it this way works, but allel uses imshow internally which rasterizes the image
#allel.plot.pairwise_distance(dist, labels=None, ax=ax, colorbar=False)
## Create the pcolormesh by hand
dat = ensure_square(dist)
## for some reason np.flipud(dat) is chopping off one row of data
p = ax.pcolormesh(np.arange(0,len(dat[0])), np.arange(0,len(dat[0])), dat,\
cmap="jet", vmin=np.min(dist), vmax=np.max(dist))
## Clip all heatmaps to actual sample size
p.axes.axis("tight")
ax.set_title(prog, style="italic")
ax.axison = False
What we want to know is total number of loci recovered (not just variable loci). First we'll create some dictionaries just like above, but we'll call them full to indicate that they include monomorphic sites. Because snps don't occur in monomorphic sites kind of by definition, we only really are iterested in the depth across loci and the number of loci recovered per sample.
In [9]:
## Blank the ordered dicts for gathering locus coverage and sample nlocs
sim_full_loc_cov = collections.OrderedDict()
sim_full_sample_nlocs = collections.OrderedDict()
In [13]:
assembly_methods = {"ipyrad-reference":"refmap-sim", "ipyrad-denovo_reference":"denovo_ref-sim"}
for name, method in assembly_methods.items():
print(method)
simdir = os.path.join(IPYRAD_SIM_DIR, method + "_outfiles/")
statsfile = simdir + "{}_stats.txt".format(method)
infile = open(statsfile).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[20:32]]
print(sample_coverage)
print("mean sample coverage - {}\t".format(np.mean(sample_coverage))),
print("min/max - {}/{}\t".format(np.min(sample_coverage), np.max(sample_coverage)))
sim_full_sample_nlocs[name] = sample_coverage
nmissing = [int(x.strip().split()[1]) for x in infile[38:50]]
sim_full_loc_cov[name] = nmissing
## Just look at the ones we care about for ipyrad
print(sim_full_loc_cov.items())
print(sim_full_sample_nlocs.items())
In [14]:
method = "stacks"
simdir = STACKS_SIM_DIR
try:
lines = open("{}/batch_1.haplotypes.tsv".format(simdir)).readlines()
cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
sim_full_loc_cov[method] = [cnts.count(i) for i in range(1,13)]
except Exception as inst:
print("loc_cov - {} - {}".format(inst, simdir))
try:
sim_full_sample_nlocs[method] = []
samp_haps = glob.glob("{}/*matches*".format(simdir))
for f in samp_haps:
lines = gzip.open(f).readlines()
sim_full_sample_nlocs[method].append(len(lines) - 1)
except Exception as inst:
print("sample_nlocs - {} - {}".format(inst, simdir))
print(sim_full_loc_cov.items())
print(sim_full_sample_nlocs.items())
In [10]:
DDOCENT_DIR = "/home/iovercast/manuscript-analysis/dDocent/"
def ddocent_stats(RUNDIR):
## This is hackish. You have to dip into the bam file to get locus counts that
## include counts for monomorphics. Note! that field 3 of the bam file is not
## the sequence data, but rather the dDocent mock-contig ('dDocent_Contig_3')
## the read maps to. So really this is kind of by proxy counting the number of loci.
sample_coverage = []
for samp in glob.glob(RUNDIR + "/*-RG.bam"):
cmd = "{}samtools view {} | cut -f 4 | uniq | wc -l".format(DDOCENT_DIR, samp)
res = subprocess.check_output(cmd, shell=True)
sample_coverage.append(int(res.strip()))
print("This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files.")
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))
cmd = "{}samtools view {}cat-RRG.bam | cut -f 3,4 | uniq".format(DDOCENT_DIR, RUNDIR)
locus_positions = subprocess.check_output(cmd, shell=True).split("\n")
locus_counter = collections.Counter() # Num loci per sample
coverage_counter = collections.Counter() # Coverage per locus
for loc in locus_positions:
if not loc:
continue
chrom = loc.split()[0]
pos = int(loc.split()[1])
cmd = "{}samtools view {}cat-RRG.bam {}:{}-{} | cut -f 1 | cut -f 3 -d \"_\" | sort | uniq".format(DDOCENT_DIR, RUNDIR, chrom, pos, pos+1)
res = subprocess.check_output(cmd, shell=True).split()
locus_counter.update(res)
coverage_counter.update([len(res)])
## Fill in zero values that aren't in the locus counter
dat = []
for i in xrange(1,13):
try:
dat.append(coverage_counter[i])
except:
dat.append(0)
return dat, locus_counter.values()
loc_cov, sample_nlocs = ddocent_stats(DDOCENT_SIM_DIR)
sim_full_loc_cov["ddocent"] = loc_cov
sim_full_sample_nlocs["ddocent"] = sample_nlocs
print(sim_full_loc_cov.items())
print(sim_full_sample_nlocs.items())
In [313]:
simlevel = ["simulated"]
sim_sample_nlocs_df = pd.DataFrame(index=assemblers, columns=simlevel)
for sim in simlevel:
for ass in assemblers:
simstring = ass
sim_sample_nlocs_df[sim][ass] = np.mean(sim_full_sample_nlocs[simstring])
print("Mean number of loci recovered per sample.")
## Normalize all bins
dat = sim_sample_nlocs_df[sim_sample_nlocs_df.columns].astype(float)
for sim in simlevel:
scale = 1000
# if "ipyrad-reference" == sim
dat[sim] = dat[sim]/scale
dat[sim]["ipyrad-reference"] *= 2
sns.heatmap(dat, square=True, center=1, linewidths=2, annot=True)
print(sim_sample_nlocs_df)
In [60]:
dfsns2 = pd.DataFrame(columns = ["assembler", "simtype", "bin", "simdata"])
for sim in simlevel:
for ass in assemblers:
simstring = ass
## Normalize values so different sim sizes print the same
max = 1000.
newdat = sim_full_loc_cov[simstring]
newdat = [sum(newdat)-sum(newdat[:i-1]) for i in range(1,13)]
for i, val in enumerate(newdat):
dfsns2.loc[simstring + "-" + str(i)] = [ass, sim, i+1, val]
g = sns.FacetGrid(dfsns2, col="simtype", hue="assembler", size=1)
g.map(plt.scatter, "bin", "simdata")
g.map(plt.plot, "bin", "simdata").add_legend()
#axs = g.axes
#for ax in axs:
# ax.set_xlim(0,12.5)
In [123]:
emp_vcf_dict = {}
emp_vcf_dict["ipyrad-reference"] = os.path.join(IPYRAD_REFMAP_DIR, "refmap-empirical_outfiles/refmap-empirical.vcf")
emp_vcf_dict["ipyrad-denovo_reference"] = os.path.join(IPYRAD_REFMAP_DIR, "denovo_ref-empirical_outfiles/denovo_ref-empirical.vcf")
emp_vcf_dict["stacks"] = os.path.join(STACKS_REFMAP_DIR, "batch_1.vcf")
emp_vcf_dict["ddocent"] = os.path.join(DDOCENT_REFMAP_DIR, "TotalRawSNPs.snps.vcf.recode.vcf")
In [124]:
emp_loc_cov = collections.OrderedDict()
emp_snp_cov = collections.OrderedDict()
emp_sample_nsnps = collections.OrderedDict()
emp_sample_nlocs = collections.OrderedDict()
## Try just doing them all the same
for prog, filename in emp_vcf_dict.items():
try:
print("Doing - {}".format(prog))
print("\t{}".format(filename))
v = vcfnp.variants(filename, dtypes={"CHROM":"a24"}).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
emp_loc_cov[prog] = loci_coverage(v, c, prog)
emp_snp_cov[prog] = snp_coverage(c)
emp_sample_nsnps[prog] = sample_nsnps(c)
emp_sample_nlocs[prog] = sample_nloci(v, c, prog)
plotPCA(c, prog)
plotPairwiseDistance(c, prog)
except Exception as inst:
print(inst)
In [608]:
for i in emp_sample_nlocs:
print(i),
print(emp_sample_nlocs[i])
In [70]:
IPYRAD_EMPIRICAL_OUTPUT=os.path.join(IPYRAD_DIR, "REALDATA/")
IPYRAD_STATS = os.path.join(IPYRAD_EMPIRICAL_OUTPUT, "REALDATA_outfiles/REALDATA_stats.txt")
infile = open(IPYRAD_STATS).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[20:33]]
print(sample_coverage)
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))
Read the biallelic vcf
In [71]:
filename = os.path.join(IPYRAD_EMPIRICAL_OUTPUT, "REALDATA_outfiles/REALDATA.biallelic.vcf")
# filename = vcf_dict["ipyrad"]
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
Getting variable sites and parsimony informative sites from the vcf is kind of annoying because all the programs export slightly different formats, so you need to parse them in slightly different ways. There's a better way to do this for ipyrad but i figure i'll do it the same way for all of them so it's more clear what's happening.
In [72]:
## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["POS"]), mask=only_pis).compressed()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x) for x in pis])
#distpis = [x for x in sorted(counts.items())]
## Getting the distvar is easier
distvar = Counter([int(x) for x in v.POS])
#distvar = [x for x in sorted(counts.items())]
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
#toyplot.html.render(canvas, 'snp_positions.html')
canvas
Out[72]:
In [12]:
STACKS_OUTPUT=os.path.join(STACKS_DIR, "REALDATA/")
STACKS_GAP_OUT=os.path.join(STACKS_OUTPUT, "gapped/")
STACKS_UNGAP_OUT=os.path.join(STACKS_OUTPUT, "ungapped/")
STACKS_DEFAULT_OUT=os.path.join(STACKS_OUTPUT, "default/")
#lines = open("SIMsmall/stackf_high/batch_1.haplotypes.tsv").readlines()
#cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
#shigh = [cnts.count(i) for i in range(1,13)]
In [13]:
filename = os.path.join(STACKS_UNGAP_OUT, "batch_1.vcf")
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
In [14]:
## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["ID"]), mask=only_pis).compressed()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x.split("_")[1]) for x in pis])
#distpis = [x for x in sorted(counts.items())]
## Getting the distvar is easier
distvar = Counter([int(x.split("_")[1]) for x in v.ID])
#distvar = [x for x in sorted(counts.items())]
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
#toyplot.html.render(canvas, 'snp_positions.html')
canvas
Out[14]:
In [349]:
vcf_dict = {}
vcf_dict["ipyrad-reference-sim"] = os.path.join(IPYRAD_SIM_DIR, "refmap-sim_outfiles/refmap-sim.vcf")
vcf_dict["ipyrad-denovo_plus_reference-sim"] = os.path.join(IPYRAD_SIM_DIR, "denovo_plus_reference-sim_outfiles/denovo_plus_reference-sim.vcf")
vcf_dict["ipyrad-denovo_minus_reference-sim"] = os.path.join(IPYRAD_SIM_DIR, "denovo_minus_reference-sim_outfiles/denovo_minus_reference-sim.vcf")
vcf_dict["stacks-sim"] = os.path.join(STACKS_SIM_DIR, "batch_1.vcf")
vcf_dict["ddocent-tot-sim"] = os.path.join(DDOCENT_SIM_DIR, "TotalRawSNPs.snps.vcf.recode.vcf")
vcf_dict["ddocent-fin-sim"] = os.path.join(DDOCENT_SIM_DIR, "Final.recode.snps.vcf.recode.vcf")
vcf_dict["ipyrad-reference-empirical"] = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/refmap-empirical_outfiles/refmap-empirical.vcf")
vcf_dict["ipyrad-denovo_plus_reference-empirical"] = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/denovo_plus_reference_outfiles/denovo_ref-empirical.vcf")
vcf_dict["ipyrad-denovo_plus_reference-095-empirical"] = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/denovo_plus_reference-095_outfiles/denovo_plus_reference-095.vcf")
vcf_dict["ipyrad-denovo_minus_reference-empirical"] = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/denovo_minus_reference_outfiles/denovo_ref-empirical.vcf")
vcf_dict["stacks-empirical"] = os.path.join(STACKS_REFMAP_DIR, "batch_1.vcf")
vcf_dict["ddocent-fin-empirical"] = os.path.join(DDOCENT_REFMAP_DIR, "Final.recode.snps.vcf.recode.vcf")
# skipt the full ddocent vcf. It's huge and we know we don't want to use it.
#vcf_dict["ddocent-tot-empirical"] = os.path.join(DDOCENT_REFMAP_DIR, "TotalRawSNPs.snps.vcf.recode.vcf")
## Make sure we have all the vcf files
for k, f in vcf_dict.items():
if os.path.exists(f):
print("found - {}".format(f))
else:
print("not found - {}".format(f))
In [56]:
## sim
pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"]
pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"]
pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"]
sim_sample_names = pop1 + pop2 + pop3
sim_pops = {"pop1":pop1, "pop2":pop2, "pop3":pop3}
sim_pop_colors = {"pop1":"r", "pop2":"b", "pop3":"g"}
## empirical
emp_pops = {}
emp_sample_names = []
popfile = os.path.join(STACKS_REFMAP_DIR, "popmap.txt")
with open(popfile) as infile:
lines = [l.strip().split() for l in infile.readlines()]
emp_sample_names = [x[0] for x in lines]
pops = set([x[1] for x in lines])
for pop in pops: emp_pops[pop] = []
for line in lines:
p = line[1]
s = line[0]
emp_pops[p].append(s)
emp_pop_colors = {k:v for (k,v) in zip(emp_pops, list(matplotlib.colors.cnames))}
## Write out the samples to pop files for vcftools
for outdir, pop_dict in {REFMAP_SIM_DIR:sim_pops, REFMAP_EMPIRICAL_DIR:emp_pops}.items():
for pop, samps in pop_dict.items():
with open(outdir + pop + ".txt", "w") as outfile:
outfile.write("\n".join(samps))
In [351]:
## The order of the samples in the vcf file is wonky. Reorder them so they're sorted by population. This
## is roughly the order of populations from the manuscript.
## If you run this multiple times, you should re-run the previous cell that resets
## the vcf file to the original path, otherwise you get a bunch of *.reorder.reorder.vcf files.
order = ["WBS", "IS", "NOS", "SK1", "KB1", "BES2", "IBS"]
vcf_header = """##fileformat=VCFv4.1\n##fileDate=20161230\n##source="Stacks v1.42"\n##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">\n##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">\n##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">\n##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Allele Depth">\n##FORMAT=<ID=GL,Number=.,Type=Float,Description="Genotype Likelihood">\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"""
with open("/tmp/dummy.vcf", 'w') as outfile:
outfile.write(vcf_header)
for p in order:
outfile.write("\t" + "\t".join(emp_pops[p]))
for k, f in vcf_dict.items():
if "stacks" in k:
## The version of vcftools we have here doesn't like the newer version of vcf stacks
## writes, so we have to 'fix' the stacks vcf version
print("Fixing stacks vcf file format - {}".format(k))
shutil.copy2(f, f+".bak")
with open(f) as infile:
dat = infile.readlines()[1:]
with open(f, 'w') as outfile:
outfile.write("##fileformat=VCFv4.1\n")
outfile.write("".join(dat))
if "empirical" in k:
print("reordering")
vcftools_path = os.path.join(DDOCENT_DIR, "vcftools_0.1.11/perl/")
os.chdir(vcftools_path)
tmpvcf = "tmp.vcf"
newvcf = f.rsplit(".", 1)[0] + ".reordered.vcf"
print(newvcf)
cmd = "perl vcf-shuffle-cols -t {} {} > {}".format("/tmp/dummy.vcf", f, tmpvcf)
print(cmd)
os.system(cmd)
!mv $tmpvcf $newvcf
## Don't destroy the old one
vcf_dict[k] = newvcf
In [352]:
import collections
## Load the calldata into a dict so we don't have to keep loading and reloading it
## This can take several minutes for the large empirical datasets (like 20+ minutes)
all_calldata = {}
all_vardata = {}
## Dicts for tracking all the stats
loc_cov = collections.OrderedDict()
snp_cov = collections.OrderedDict()
samp_nsnps = collections.OrderedDict()
samp_nlocs = collections.OrderedDict()
for prog, filename in vcf_dict.items():
try:
print("Doing - {}\n {}".format(prog, filename))
v = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray)
c = vcfnp.calldata_2d(filename, verbose=False).view(np.recarray)
all_calldata[prog] = c
all_vardata[prog] = v
snp_cov[prog] = snp_coverage(c)
samp_nsnps[prog] = sample_nsnps(c)
loc_cov[prog] = loci_coverage(v, c, prog)
samp_nlocs[prog] = sample_nloci(v, c, prog)
except Exception as inst:
print(inst)
In [353]:
for statname, stat in {"loc_cov":loc_cov, "snp_cov":snp_cov,\
"sample_nsnps":samp_nsnps, "sample_nlocs":samp_nlocs}.items():
for prog in vcf_dict.keys():
try:
print(prog + " " + statname + "\t"),
print(stat[prog]),
print(np.mean(stat[prog]))
except:
print("No {} stats for {}".format(statname, prog))
print("------------------------------------------------------")
In [354]:
f, axarr = plt.subplots(2, 5, figsize=(20,8), dpi=1000)
## Make a list out of the axes
axarr = [a for b in axarr for a in b]
## Set them in order so the plot looks nice.
progs = ["ipyrad-reference-sim", "ipyrad-denovo_plus_reference-sim", "ipyrad-denovo_minus_reference-sim", "stacks-sim", "ddocent-fin-sim",\
"ipyrad-reference-empirical", "ipyrad-denovo_plus_reference-095-empirical", "ipyrad-denovo_minus_reference-empirical", "stacks-empirical", "ddocent-fin-empirical"]
for prog, ax in zip(progs, axarr):
print(prog, ax)
if "empirical" in prog:
pop_colors = emp_pop_colors
pops = emp_pops
sample_names = emp_sample_names
else:
pop_colors = sim_pop_colors
pops = sim_pops
sample_names = sim_sample_names
## Don't die if some of the runs aren't complete
try:
coords1, model1 = getPCA(all_calldata[prog])
except:
continue
x = coords1[:, 0]
y = coords1[:, 1]
ax.scatter(x, y, marker='o')
ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
for pop in pops.keys():
flt = np.in1d(np.array(sample_names), pops[pop])
ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=pop_colors[pop], label=pop, markersize=10, mec='k', mew=.5)
ax.set_title(prog, style="italic")
ax.axison = True
axarr[0].legend(frameon=True)
axarr[5].legend(loc='lower left', frameon=True)
f.tight_layout()
In [355]:
f, axarr = plt.subplots(2, 5, figsize=(20,8), dpi=1000)
axarr = [a for b in axarr for a in b]
## Set them in order so the plot looks nice.
progs = ["ipyrad-reference-sim", "ipyrad-denovo_plus_reference-sim", "ipyrad-denovo_minus_reference-sim", "stacks-sim", "ddocent-fin-sim",\
"ipyrad-reference-empirical", "ipyrad-denovo_plus_reference-095-empirical", "ipyrad-denovo_minus_reference-empirical", "stacks-empirical", "ddocent-fin-empirical"]
for prog, ax in zip(progs, axarr):
print(prog, ax)
try:
## Calculate pairwise distances
dist = getDistances(all_calldata[prog])
except:
continue
## Create the pcolormesh by hand
dat = ensure_square(dist)
## for some reason np.flipud(dat) is chopping off one row of data
p = ax.pcolormesh(np.arange(0,len(dat[0])), np.arange(0,len(dat[0])), dat,\
cmap="jet", vmin=np.min(dist), vmax=np.max(dist))
## Clip all heatmaps to actual sample size
p.axes.axis("tight")
ax.set_title(prog, style="italic")
ax.axison = False
## Adjust margins to make room for the colorbar
plt.subplots_adjust(left=0.05, bottom=0.1, right=0.8, top=0.9, wspace=0.2, hspace=0.3)
## Add the colorbar
cax = f.add_axes([0.87, 0.4, 0.03, 0.5])
cb1 = matplotlib.colorbar.ColorbarBase(cax, cmap="jet", orientation="vertical", ticks=([0,1]))
cb1.ax.set_yticklabels(['Similar', "Dissimilar"], weight="bold", rotation="vertical")
Out[355]:
In [357]:
## Blank the ordered dicts for gathering locus coverage and sample nlocs
all_full_loc_cov = collections.OrderedDict()
all_full_sample_nlocs = collections.OrderedDict()
## Mapping between 'pretty names' and directory names. This is only really useful for ipyrad
## where i named the directories a little silly. For stacks and ddocent this doesn't really do anything.
assembly_methods = {"ipyrad-reference-sim":"refmap-sim", "ipyrad-denovo_reference-sim":"denovo_ref-sim",\
"stacks-sim":"stacks-sim", "ipyrad-reference-empirical":"refmap-empirical",\
"stacks-empirical":"stacks-empirical", "ddocent-simulated":"ddocent-simulated",\
"ddocent-empirical":"ddocent-empirical",\
"ipyrad_denovo_minus_reference-sim":"denovo_minus_reference-sim",\
"ipyrad_denovo_plus_reference-sim":"denovo_plus_reference-sim",\
"ipyrad_denovo_minus_reference-empirical":"denovo_minus_reference",\
"ipyrad_denovo_plus_reference-empirical":"denovo_plus_reference-095"
}
for name, method in assembly_methods.items():
print("Doing - {}".format(name))
if "ipyrad" in name:
outdir = IPYRAD_SIM_DIR
firstsamp = 20
lastsamp = 32
if "empirical" in name:
outdir = IPYRAD_REFMAP_DIR + "reference-assembly/"
firstsamp = 20 #fixme
lastsamp = 64 # fixme
try:
nsamps = lastsamp - firstsamp
simdir = os.path.join(outdir, method + "_outfiles/")
statsfile = simdir + "{}_stats.txt".format(method)
infile = open(statsfile).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[firstsamp:lastsamp]]
print("mean sample coverage - {}\t".format(np.mean(sample_coverage))),
print("min/max - {}/{}\t".format(np.min(sample_coverage), np.max(sample_coverage)))
all_full_sample_nlocs[name] = sample_coverage
nmissing = [int(x.strip().split()[1]) for x in infile[lastsamp+6:lastsamp+6+nsamps]]
all_full_loc_cov[name] = nmissing
except Exception as inst:
print(inst)
elif "stacks" in name:
outdir = STACKS_SIM_DIR
nsamps = 12
if "empirical" in name:
outdir = STACKS_REFMAP_DIR
nsamps = 44
try:
## Effectively the same as thisjjjj
## cut -f 2 batch_1.haplotypes.tsv | sort -n | uniq -c | less
lines = open("{}/batch_1.haplotypes.tsv".format(outdir)).readlines()
cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
all_full_loc_cov[method] = [cnts.count(i) for i in range(1,nsamps+1)]
except Exception as inst:
print("loc_cov - {} - {}".format(inst, simdir))
try:
all_full_sample_nlocs[method] = []
## This is actually number of loci
## cut -f 3 *matches | sort -n | uniq | wc -l
## Right now this is actually counting number of snps
samp_haps = glob.glob("{}/*matches*".format(outdir))
for f in samp_haps:
lines = gzip.open(f).readlines()
all_full_sample_nlocs[method].append(len(lines) - 1)
except Exception as inst:
print("sample_nlocs - {} - {}".format(inst, outdir))
# elif "ddocent" in name:
# outdir = DDOCENT_SIM_DIR
# if "empirical" in name:
# outdir = DDOCENT_REFMAP_DIR
# loc_cov, sample_nlocs = ddocent_stats(outdir)
# all_full_loc_cov[method] = loc_cov
# all_full_sample_nlocs[method] = sample_nlocs
else:
print("wtf?")
try:
print(all_full_loc_cov[name])
print(all_full_sample_nlocs[name])
except:
print("no data for - {}".format(name))
pass
print("Done - {}\n\n".format(name))
print(all_full_loc_cov.items())
print(all_full_sample_nlocs.items())
In [359]:
import itertools
## dict for storing fst values. This is a dict of dictionaries. The top level is for
## each assembly type and within each assembly type is a dict of pop pairs and the fst between them.
fsts = {}
## Get pairwise fst for each pop for sim and empirical
d = {REFMAP_SIM_DIR:sim_pops, REFMAP_EMPIRICAL_DIR:emp_pops}
for k, v in vcf_dict.items():
if not os.path.exists(v):
continue
if "sim" in k:
indir = REFMAP_SIM_DIR
pop_dict = sim_pops
else:
indir = REFMAP_EMPIRICAL_DIR
pop_dict = emp_pops
os.chdir(indir)
try:
print("Doing - {}".format(k))
fsts[k] = {}
combinations = list(itertools.combinations(pop_dict, 2))
for pair in combinations:
pop1 = indir + pair[0] + ".txt"
pop2 = indir + pair[1] + ".txt"
vcftools_cmd = DDOCENT_DIR + "vcftools"
cmd = "{} --vcf {} --weir-fst-pop {} --weir-fst-pop {} --out {}".format(vcftools_cmd, v, pop1, pop2, k)
ret = subprocess.check_output(cmd, shell=True)
ret = ret.split("\n")
## Get the mean fst from the output
fst = [x for x in ret if "Weir and Cockerham mean" in x][0].split(": ")[1]
weighted_fst = [x for x in ret if "Weir and Cockerham weighted" in x][0].split(": ")[1]
print("-".join(pair), "{}/{}".format(fst, weighted_fst)),
fsts[k]["-".join(pair)] = "{}/{}".format(fst, weighted_fst)
print("\n")
except Exception as inst:
print("Failed - {} - {}".format(k, inst))
## Allow it to fail if the vcf doesn't exist.
pass
In [360]:
from IPython.display import display
import pandas as pd
## Organize and pretty print the fsts dict
df_sim = pd.DataFrame(index=sim_pops.keys(), columns=sim_pops.keys(), dtype=str).fillna("")
df_emp = pd.DataFrame(index=emp_pops.keys(), columns=emp_pops.keys(), dtype=str).fillna("")
for assembler, all_data in fsts.items():
print("{}".format(assembler))
if "sim" in assembler:
indir = REFMAP_SIM_DIR
pop_dict = sim_pops
df = df_sim
else:
indir = REFMAP_EMPIRICAL_DIR
pop_dict = emp_pops
df = df_emp
## Init the diagonal
for p in pop_dict.keys():
df[p][p] = ""
for p, fst_data in all_data.items():
#print(df)
p1 = p.split("-")[0]
p2 = p.split("-")[1]
df[p1][p2] += " "+fst_data.split("/")[0][:6]
df[p2][p1] += " "+fst_data.split("/")[1][:6]
pd.set_option('display.max_colwidth',800)
pd.set_option('display.width',80)
df_sim.style.set_properties(**{'max-width':'10px'})
display(df_sim)
display(df_emp)
In [177]:
%%bash -s "$WORK_DIR"
## Install raxml
mkdir $1/miniconda/src
cd $1/miniconda/src
git clone https://github.com/stamatak/standard-RAxML.git
cd standard-RAxML
make -f Makefile.PTHREADS.gcc
cp raxml-PTHREADS $1/miniconda/bin
In [211]:
## Get this script to convert from vcf to phy
## This script wants python 3, so I had to go in and add the future import for
## print by hand. Add this on line 17:
##
## from __future__ import print_function
##
## Also, had to update lines 32-34 like so:
##
## iupac = {"AG": "R", "CT": "Y", "CG": "S", "AT": "W", "GT": "K", "AC": "M",
## "CGT": "B", "AGT": "D", "ACT": "H", "ACG": "V", "ACGT": "N", "AA": "A",
## "CC": "C", "GG": "G", "TT":"T", "GN":"N", "CN":"N", "AN":"N", "TN":"N"}
## "NT":"N", "NC":"N", "NG":"N", "NA":"N", "NN":"N"}
## Also, the when we decompose the ddocent complex genotypes vcftools splits
## ./. data as '.', which VCF2phy.py kinda hates, so you have to add this piece
## of code on lines 75/75:
##
## if genotype_string == ".":
## return "N"
#!wget https://raw.githubusercontent.com/CoBiG2/RAD_Tools/master/VCF2phy.py
for k, v in vcf_dict.items():
if "stacks" in k and "sim" in k:
outphy = v.rsplit(".", 1)[0]
print(outphy)
cmd = "python VCF2phy.py -vcf {} -o {}".format(v, outphy)
ret = subprocess.check_output(cmd, shell=True)
print(ret)
if "ddocent" in k:
## Skip doing the big boy for now
if "empirical" in k and "Total" in v:
continue
## Do not add .phy to outphy, the python script does it for us.
outphy = v.rsplit(".", 1)[0]
print(outphy)
cmd = "python VCF2phy.py -vcf {} -o {}".format(v, outphy)
ret = subprocess.check_output(cmd, shell=True)
print(ret)
In [361]:
for d in [REFMAP_SIM_DIR, REFMAP_EMPIRICAL_DIR]:
raxoutdir = d + "raxml_outdir"
if not os.path.exists(raxoutdir):
os.mkdir(raxoutdir)
emp_trees = {}
sim_trees = {}
for assembler, vcffile in vcf_dict.items():
if "sim" in assembler:
outdir = REFMAP_SIM_DIR + "raxml_outdir"
ncores = 2
out_trees = sim_trees
physplit = 1
else:
outdir = REFMAP_EMPIRICAL_DIR + "raxml_outdir"
ncores = 20
out_trees = emp_trees
physplit = 2
if "ipyrad" in assembler:
inputfile = vcffile.rsplit(".", physplit)[0] + ".snps.phy"
if "stacks" in assembler:
inputfile = vcffile.rsplit(".", 2)[0] + ".phylip"
if "ddocent" in assembler:
inputfile = vcffile.rsplit(".", 1)[0] + ".phy"
if not os.path.exists(inputfile):
print("\nCan't find .phy or .phylip for - {}\n".format(inputfile.rsplit(".", 1)[0]))
cmd = "{}raxmlHPC-PTHREADS -f a ".format(WORK_DIR + "/miniconda/bin/") \
+ " -T {} ".format(ncores) \
+ " -m GTRGAMMA " \
+ " -N 100 " \
+ " -x 12345 " \
+ " -p 54321 " \
+ " -n {} ".format(assembler) \
+ " -w {} ".format(outdir) \
+ " -s {}".format(inputfile)
print(cmd)
## What's the difference?
## out_trees[assembler] = "{}/RAxML_bestTree.{}".format(outdir, assembler)
out_trees[assembler] = "{}/RAxML_bipartitions.{}".format(outdir, assembler)
#continue
if "sim" in assembler:
print(assembler)
#!time $cmd
if "empirical" in assembler:
print(assembler)
#!time $cmd
In [368]:
#!conda install biopython
from Bio import Phylo
f, axarr = plt.subplots(1, 5, figsize=(20,8), dpi=1000)
## For more than 1 row uncomment this
#axarr = [a for b in axarr for a in b]
## Pop membership and colors per pop were estabilshed above during the PCA
#sim_sample_names = pop1 + pop2 + pop3
#sim_pops = {"pop1":pop1, "pop2":pop2, "pop3":pop3}
#sim_pop_colors = {"pop1":"r", "pop2":"b", "pop3":"g"}
sim_colors_per_sample = {}
for samp in sim_sample_names:
for pop in sim_pops:
if samp in sim_pops[pop]:
sim_colors_per_sample[samp] = sim_pop_colors[pop]
## Set them in order so the plot looks nice.
#progs = ["ipyrad-reference-sim", "ipyrad-denovo_plus_reference-sim", "ipyrad-denovo_minus_reference-sim", "stacks-sim", "ddocent-fin-sim",\
# "ipyrad-reference-empirical", "ipyrad-denovo_reference-empirical", "ipyrad-denovo_minus_reference-empirical", "stacks-empirical", "ddocent-fin-empirical"]
outgroup = [{'name': taxon_name} for taxon_name in pop3]
for assembler, ax in zip(sim_trees.keys(), axarr):
print(assembler, sim_trees[assembler], ax)
tree = Phylo.read(sim_trees[assembler], 'newick')
tree.ladderize()
tree.root_with_outgroup(*outgroup)
## This could be cool but doesn't work
Phylo.draw(tree, axes = ax, do_show=False, label_colors=sim_colors_per_sample)
ax.set_title(assembler)
plt.tight_layout()
In [367]:
del sim_trees['ddocent-tot-sim']
print(len(sim_trees))
print(sim_trees)
In [ ]:
## Set empirical colors
emp_colors_per_sample = {}
for samp in emp_sample_names:
for pop in emp_pops:
if samp in emp_pops[pop]:
emp_colors_per_sample[samp] = emp_pop_colors[pop]
In [ ]:
import ete3
In [ ]:
for prog in all_calldata.keys():
#for prog in ["ipyrad-reference-sim"]:
print("Doing {}".format(prog))
try:
c = all_calldata[prog]
v = all_vardata[prog]
except:
print("Failed to load for {}".format(prog))
continue
## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["POS"]), mask=only_pis).compressed()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x) for x in pis])
#distpis = [x for x in sorted(counts.items())]
## Getting the distvar is easier
distvar = Counter([int(x) for x in v.POS])
#distvar = [x for x in sorted(counts.items())]
canvas, axes = SNP_position_plot(prog, distvar, distpis)
## save fig
#toyplot.html.render(canvas, 'snp_positions.html')
canvas
In [ ]:
## Checking out differences between 0.85 and 0.95 clustering threshold for ipyrad
In [339]:
filename = "/home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_minus_reference-0.95_outfiles/denovo_ref-empirical.vcf"
## Only need to load once
#myv = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray)
#myc = vcfnp.calldata_2d(filename, verbose=False).view(np.recarray)
f, axarr = plt.subplots(1, 2, figsize=(20,8), dpi=1000)
## Make a list out of the axes
## Set them in order so the plot looks nice.
progs = ["ipyrad-reference-sim", "ipyrad-denovo_plus_reference-sim", "ipyrad-denovo_minus_reference-sim", "stacks-sim", "ddocent-fin-sim",\
"ipyrad-reference-empirical", "ipyrad-denovo_reference-empirical", "ipyrad-denovo_minus_reference-empirical", "stacks-empirical", "ddocent-fin-empirical"]
rundict = {"0.85":all_calldata["ipyrad-denovo_minus_reference-empirical"], "0.95":myc}
for prog, ax in zip(rundict.keys(), axarr):
pop_colors = emp_pop_colors
pops = emp_pops
sample_names = emp_sample_names
## Don't die if some of the runs aren't complete
try:
coords1, model1 = getPCA(rundict[prog])
except:
continue
x = coords1[:, 0]
y = coords1[:, 1]
ax.scatter(x, y, marker='o')
ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
for pop in pops.keys():
flt = np.in1d(np.array(sample_names), pops[pop])
ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=pop_colors[pop], label=pop, markersize=10, mec='k', mew=.5)
ax.set_title(prog, style="italic")
ax.axison = True
axarr[0].legend(frameon=True)
Out[339]:
In [329]:
def plot_ld(gn, title):
m = allel.stats.rogers_huff_r(gn) ** 2
ax = allel.plot.pairwise_ld(m)
ax.set_title(title)
def ld_prune(gn, size=1000, step=1000, threshold=.3, n_iter=5):
for i in range(n_iter):
loc_unlinked = allel.stats.ld.locate_unlinked(gn, size=size, step=step, threshold=threshold)
n = np.count_nonzero(loc_unlinked)
n_remove = gn.shape[0] - n
print('iteration', i+1, 'retaining', n, 'removing', n_remove, 'variants')
gn = gn.compress(loc_unlinked, axis=0)
return gn
In [293]:
g = allel.GenotypeArray(myc.genotype)
ac = g.count_alleles()
## Filter singletons and multi-allelic snps
flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
gf = g.compress(flt, axis=0)
gn = gf.to_n_alt()
In [303]:
gnu = ld_prune(gn, size=50, step=200, threshold=.1, n_iter=5)
In [ ]:
coords1, model1 = allel.pca(gnu, n_components=10, scaler='patterson')