In [8]:
## Prereqs for plotting and analysis
#!conda install matplotlib
#!conda install libgcc # for vcfnp
#!pip install vcfnp
#!pip install scikit-allel
#!conda install -y seaborn
#!pip install toyplot
#!pip install numpy_indexed
In [1]:
## Imports and working/output directories directories
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = [12,9]
from collections import Counter
import seaborn as sns
sns.set_style('white')
sns.set_style('ticks')
import toyplot
import toyplot.html ## toypot sublib for saving html plots
import pandas as pd
import numpy as np
import collections
import allel
import vcfnp
import shutil
import glob
import os
from allel.util import * ## for ensure_square()
## Set the default directories for exec and data.
WORK_DIR="/home/iovercast/manuscript-analysis/"
EMPERICAL_DATA_DIR=os.path.join(WORK_DIR, "example_empirical_rad/")
IPYRAD_DIR=os.path.join(WORK_DIR, "ipyrad/")
PYRAD_DIR=os.path.join(WORK_DIR, "pyrad/")
STACKS_DIR=os.path.join(WORK_DIR, "stacks/")
AFTRRAD_DIR=os.path.join(WORK_DIR, "aftrRAD/")
DDOCENT_DIR=os.path.join(WORK_DIR, "dDocent/")
## tmp for testing
#IPYRAD_OUTPUT = "./arch/ipyrad/REALDATA/"
#PYRAD_OUTPUT = "./arch/pyrad/REALDATA/"
os.chdir(WORK_DIR)
In [2]:
## Get sample names and assign them to a species dict
IPYRAD_STATS=os.path.join(IPYRAD_DIR, "REALDATA/REALDATA_outfiles/REALDATA_stats.txt")
infile = open(IPYRAD_STATS).readlines()
sample_names = [x.strip().split()[0] for x in infile[20:33]]
species = set([x.split("_")[1] for x in sample_names])
species_dict = collections.OrderedDict([])
## Ordered dict of sample names and their species, in same order
## as the vcf file
for s in sample_names:
species_dict[s] = s.split("_")[1]
print(species_dict)
## Map species names to groups of individuals
#for s in species:
# species_dict[s] = [x for x in sample_names if s in x]
#print(species_dict)
species_colors = {
'rex': '#FF0000',
'cyathophylla': '#008000',
'thamno': '#00FFFF',
'cyathophylloides': '#90EE90',
'przewalskii': '#FFA500',
'superba': '#8B0000',
}
In [4]:
def plotPCA(call_data, title):
c = call_data
g = allel.GenotypeArray(c.genotype)
ac = g.count_alleles()
## Filter singletons and multi-allelic snps
flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
gf = g.compress(flt, axis=0)
gn = gf.to_n_alt()
coords1, model1 = allel.stats.pca(gn, n_components=10, scaler='patterson')
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1)
sns.despine(ax=ax, offset=5)
x = coords1[:, 0]
y = coords1[:, 1]
## We know this works because the species_dict and the columns in the vcf
## are in the same order.
for sp in species:
flt = (np.array(species_dict.values()) == sp)
ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=species_colors[sp], label=sp, markersize=10, mec='k', mew=.5)
ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
fig.suptitle(title+" pca", y=1.02, style="italic", fontsize=20, fontweight='bold')
fig.tight_layout()
def getPCA(call_data):
c = call_data
g = allel.GenotypeArray(c.genotype)
ac = g.count_alleles()
## Filter singletons and multi-allelic snps
flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
gf = g.compress(flt, axis=0)
gn = gf.to_n_alt()
coords1, model1 = allel.stats.pca(gn, n_components=10, scaler='patterson')
return coords1, model1
"""
fig = plt.figure(figsize=(5, 5))
x = coords1[:, 0]
y = coords1[:, 1]
## We know this works because the species_dict and the columns in the vcf
## are in the same order.
# for sp in species:
# flt = (np.array(species_dict.values()) == sp)
# ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=species_colors[sp], label=sp, markersize=10, mec='k', mew=.5)
ax.plot(x, y, marker='o', markersize=10)
ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
"""
def plotPairwiseDistance(call_data, title):
c = call_data
#c = vcfnp.calldata_2d(filename).view(np.recarray)
g = allel.GenotypeArray(c.genotype)
gn = g.to_n_alt()
dist = allel.stats.pairwise_distance(gn, metric='euclidean')
allel.plot.pairwise_distance(dist, labels=species_dict.keys())
def getDistances(call_data):
c = call_data
#c = vcfnp.calldata_2d(filename).view(np.recarray)
g = allel.GenotypeArray(c.genotype)
gn = g.to_n_alt()
dist = allel.stats.pairwise_distance(gn, metric='euclidean')
return(dist)
In [5]:
## Inputs to this function are two Counters where keys
## are the base position and values are snp counts.
## The counter doesn't have to be sorted because we sort internally.
def SNP_position_plot(distvar, distpis):
## The last position to consider
maxend = max(distvar.keys())
## This does two things, first it sorts in increasing
## order. Second, it creates a count bin for any position
## without snps and sets the count to 0.
distvar = [distvar[x] for x in xrange(maxend)]
distpis = [distpis[x] for x in xrange(maxend)]
## set color theme
colormap = toyplot.color.Palette()
## make a canvas
canvas = toyplot.Canvas(width=800, height=300)
## make axes
axes = canvas.cartesian(xlabel="Position along RAD loci",
ylabel="N variables sites",
gutter=65)
## x-axis
axes.x.ticks.show = True
axes.x.label.style = {"baseline-shift":"-40px", "font-size":"16px"}
axes.x.ticks.labels.style = {"baseline-shift":"-2.5px", "font-size":"12px"}
axes.x.ticks.below = 5
axes.x.ticks.above = 0
axes.x.domain.max = maxend
axes.x.ticks.locator = toyplot.locator.Explicit(
range(0, maxend, 5),
map(str, range(0, maxend, 5)))
## y-axis
axes.y.ticks.show=True
axes.y.label.style = {"baseline-shift":"40px", "font-size":"16px"}
axes.y.ticks.labels.style = {"baseline-shift":"5px", "font-size":"12px"}
axes.y.ticks.below = 0
axes.y.ticks.above = 5
## add fill plots
x = np.arange(0, maxend)
f1 = axes.fill(x, distvar, color=colormap[0], opacity=0.5, title="total variable sites")
f2 = axes.fill(x, distpis, color=colormap[1], opacity=0.5, title="parsimony informative sites")
## add a horizontal dashed line at the median Nsnps per site
axes.hlines(np.median(distvar), opacity=0.9, style={"stroke-dasharray":"4, 4"})
axes.hlines(np.median(distpis), opacity=0.9, style={"stroke-dasharray":"4, 4"})
return canvas, axes
In [6]:
import numpy_indexed as npi
## Get the number of samples with data at each snp
def snp_coverage(call_data):
snp_counts = collections.Counter([x.sum() for x in call_data["GT"] != "./."])
## Fill zero values
return [snp_counts[x] for x in xrange(1, max(snp_counts.keys())+1)]
## Get the number of samples with data at each locus
def loci_coverage(var_data, call_data, assembler):
if "stacks" in assembler:
loci = zip(*npi.group_by(map(lambda x: x.split("_")[0],var_data["ID"]))(call_data["GT"] != "./."))
else:
loci = zip(*npi.group_by(var_data["CHROM"])(call_data["GT"] != "./."))
counts_per_snp = []
for z in xrange(0, len(loci)):
counts_per_snp.append([x.sum() for x in loci[z][1]])
counts = collections.Counter([np.max(x) for x in counts_per_snp])
## Fill all zero values
return [counts[x] for x in xrange(1, max(counts.keys())+1)]
## Get total number of snps per sample
def sample_nsnps(call_data):
return [x.sum() for x in call_data["GT"].T != "./."]
## Get total number of loci per sample
def sample_nloci(var_data, call_data, assembler):
if "stacks" in assembler:
locus_groups = npi.group_by(map(lambda x: x.split("_")[0],var_data["ID"]))(call_data["GT"] != "./.")
else:
locus_groups = npi.group_by(v["CHROM"])(c["GT"] != "./.")
by_locus = [x.T for x in locus_groups[1]]
by_sample = np.array([(x).any(axis=1) for x in by_locus])
return [x.sum() for x in by_sample.T]
In [7]:
## Make a new pandas dataframe for holding the coverage results
## this is going to
sim_vcf_dict = {}
for sim in ["simno", "simlo", "simhi", "simlarge"]:
sim_vcf_dict["ipyrad-"+sim] = os.path.join(IPYRAD_DIR, "SIMDATA/{}/{}_outfiles/{}.vcf".format(sim, sim, sim))
sim_vcf_dict["pyrad-"+sim] = os.path.join(PYRAD_DIR, "SIMDATA/{}/outfiles/c85d6m2p3H3N3.vcf".format(sim))
sim_vcf_dict["stacks_ungapped-"+sim] = os.path.join(STACKS_DIR, "SIMDATA/ungapped/{}/batch_1.vcf".format(sim))
sim_vcf_dict["stacks_gapped-"+sim] = os.path.join(STACKS_DIR, "SIMDATA/gapped/{}/batch_1.vcf".format(sim))
sim_vcf_dict["stacks_defualt-"+sim] = os.path.join(STACKS_DIR, "SIMDATA/default/{}/batch_1.vcf".format(sim))
sim_vcf_dict["aftrrad-"+sim] = os.path.join(AFTRRAD_DIR, "SIMDATA/{}/Formatting/{}.vcf".format(sim, sim))
sim_vcf_dict["ddocent_full-"+sim] = os.path.join(DDOCENT_DIR, "SIMDATA/{}/{}_fastqs/TotalRawSNPs.vcf".format(sim, sim))
sim_vcf_dict["ddocent_filt-"+sim] = os.path.join(DDOCENT_DIR, "SIMDATA/{}/{}_fastqs/Final.recode.vcf".format(sim, sim))
In [37]:
for k, f in sim_vcf_dict.items():
## If it's gzipped then unzip it (only applies to ipyrad)
if os.path.exists(f + ".gz"):
print("gunzipping - {}".format(f+".gz"))
cmd = "gunzip -c {}.gz > {}".format(f, f)
!$cmd
f = f.split(".gz")[0]
sim_vcf_dict[k] = f
if os.path.exists(f):
print("found - {}".format(f))
if "stacks" in k:
## The version of vcftools we have here doesn't like the newer version of vcf stacks
## writes, so we have to 'fix' the stacks vcf version
shutil.copy2(f, f+".bak")
with open(f) as infile:
dat = infile.readlines()[1:]
with open(f, 'w') as outfile:
outfile.write("##fileformat=VCFv4.1\n")
outfile.write("".join(dat))
## Actually don't do this, it removes information about the assembly.
## I don't know why i wanted to do this in the first place. Habit?
## try:
## ## Remove all but biallelic.
## outfile = f.split(".vcf")[0]+"-biallelic"
## cmd = "{}vcftools --vcf {} --min-alleles 2 --max-alleles 2 --recode --out {}" \
## .format(DDOCENT_DIR, f, outfile)
## #print(cmd)
## os.system(cmd)
##
## ## update the vcf_dict
## sim_vcf_dict[k] = outfile + ".recode.vcf"
## except Exception as inst:
## print(inst)
else:
print("not found - {}".format(f))
Get number of loci per sample, and locus coverage for each vcf file from each assembler and each simulated dataset. This is reading in from the vcf files from each assembly method. This is nice because vcf is relatively standard and all the tools can give us a version of vcf. It's not perfect though because it doesn't include information about monomorphic sites, so it doesn't tell us the true number of loci recovered. We can get an idea of coverage and depth at snps, but to get coverage and depth stats across all loci we need to dig into the guts of the output of each method (which we'll do later).
In [38]:
import collections
sim_loc_cov = collections.OrderedDict()
sim_snp_cov = collections.OrderedDict()
sim_sample_nsnps = collections.OrderedDict()
sim_sample_nlocs = collections.OrderedDict()
## Try just doing them all the same
for prog, filename in sim_vcf_dict.items():
try:
print("Doing - {}".format(prog))
print("\t{}".format(filename))
v = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray)
c = vcfnp.calldata_2d(filename, verbose=False).view(np.recarray)
sim_snp_cov[prog] = snp_coverage(c)
sim_sample_nsnps[prog] = sample_nsnps(c)
## aftrRAD doesn't retain the kind of info we'd need for the next two
if "aftrrad" in prog:
continue
sim_loc_cov[prog] = loci_coverage(v, c, prog)
sim_sample_nlocs[prog] = sample_nloci(v, c, prog)
except Exception as inst:
print(inst)
In [496]:
res_dict = {"sim_loc_cov":sim_loc_cov, "sim_snp_cov":sim_snp_cov,\
"sim_sample_nsnps":sim_sample_nsnps, "sim_sample_nlocs":sim_sample_nlocs}
res = WORK_DIR + "RESULTS/"
if not os.path.exists(res):
os.mkdir(res)
for k,v in res_dict.items():
df = pd.DataFrame(v)
df.to_csv(res + k + ".csv")
In [497]:
for k,v in res_dict.items():
df = pd.read_csv(res + k + ".csv")
print(df)
This isn't very helpful, but you get an idea of what's going on. Values here for stacks gapped analysis look weird because they are bad. The populations program was segfaulting during creation of the vcf, so it's super truncated. Not sure what the problem is. The results down below the plots pull in stats from other files so they are better.
In [39]:
for statname, stat in {"sim_loc_cov":sim_loc_cov, "sim_snp_cov":sim_snp_cov,\
"sim_sample_nsnps":sim_sample_nsnps, "sim_sample_nlocs":sim_sample_nlocs}.items():
for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
for prog in ["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", "stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"]:
try:
print(prog+sim + " " + statname + "\t"),
print(stat[prog + sim]),
print(np.mean(stat[prog + sim]))
except:
print("No {} stats for {}".format(statname, prog + sim))
print("------------------------------------------------------")
In [61]:
## Load the calldata into a dict so we don't have to keep loading and reloading it
calldata = {}
for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
for prog in ["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", \
"stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"]:
print("{}".format(prog+sim)),
print("{}".format(sim_vcf_dict[prog+sim]))
c = vcfnp.calldata_2d(sim_vcf_dict[prog+sim], verbose=False).view(np.recarray)
calldata[prog+sim] = c
In [617]:
pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"]
pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"]
pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"]
sim_sample_names = pop1 + pop2 + pop3
flt = np.in1d(np.array(sim_sample_names), pop1)
print(flt)
In [619]:
## Some housekeeping with sample names to make the PCA plots prettier
pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"]
pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"]
pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"]
pops = {"pop1":pop1, "pop2":pop2, "pop3":pop3}
pop_colors = {"pop1":"r", "pop2":"b", "pop3":"g"}
sim_sample_names = pop1 + pop2 + pop3
for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
#for sim in ["-simhi"]:
print("Doing - {}".format(sim))
f, axarr = plt.subplots(2, 4, figsize=(16,8), dpi=1000)
axarr = [a for b in axarr for a in b]
for prog, ax in zip(["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", \
"stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"], axarr):
## Annoying debug messages
#print("{}".format(prog+sim)),
#print("{}".format(sim_vcf_dict[prog+sim]))
coords1, model1 = getPCA(calldata[prog+sim])
x = coords1[:, 0]
y = coords1[:, 1]
ax.scatter(x, y, marker='o')
ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
for pop in pops.keys():
flt = np.in1d(np.array(sim_sample_names), pops[pop])
ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=pop_colors[pop], label=pop, markersize=10, mec='k', mew=.5)
ax.set_title(prog+sim, style="italic")
ax.axison = True
f.tight_layout()
In [65]:
for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
#for sim in ["-simhi"]:
print("Doing - {}".format(sim))
f, axarr = plt.subplots(2, 4, figsize=(16,8), dpi=1000)
axarr = [a for b in axarr for a in b]
for prog, ax in zip(["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", \
"stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"], axarr):
print("{}".format(prog+sim)),
print("{}".format(sim_vcf_dict[prog+sim]))
## Calculate pairwise distances
dist = getDistances(calldata[prog+sim])
## Doing it this way works, but allel uses imshow internally which rasterizes the image
#allel.plot.pairwise_distance(dist, labels=None, ax=ax, colorbar=False)
## Create the pcolormesh by hand
dat = ensure_square(dist)
## for some reason np.flipud(dat) is chopping off one row of data
p = ax.pcolormesh(np.arange(0,len(dat[0])), np.arange(0,len(dat[0])), dat,\
cmap="jet", vmin=np.min(dist), vmax=np.max(dist))
## Clip all heatmaps to actual sample size
p.axes.axis("tight")
ax.set_title(prog+sim, style="italic")
ax.axison = False
What we want to know is total number of loci recovered (not just variable loci). First we'll create some dictionaries just like above, but we'll call them full to indicate that they include monomorphic sites. Because snps don't occur in monomorphic sites kind of by definition, we only really are iterested in the depth across loci and the number of loci recovered per sample.
In [71]:
sim_full_loc_cov = collections.OrderedDict()
sim_full_sample_nlocs = collections.OrderedDict()
## Try just doing them all the same
for prog, filename in sim_vcf_dict.items():
print("Doing - {}\t".format(prog)),
sim_full_loc_cov[prog] = []
sim_full_sample_nlocs[prog] = []
In [407]:
## ipyrad stats
IPYRAD_SIMOUT=os.path.join(IPYRAD_DIR, "SIMDATA/")
for sim in ["simno", "simlo", "simhi", "simlarge"]:
print("Doing - {}\t".format(sim)),
simstring = "ipyrad-" + sim
simdir = os.path.join(IPYRAD_SIMOUT, sim)
statsfile = simdir + "/{}_outfiles/{}_stats.txt".format(sim, sim)
infile = open(statsfile).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[20:32]]
#print(sample_coverage)
print("mean sample coverage - {}\t".format(np.mean(sample_coverage))),
print("min/max - {}/{}\t".format(np.min(sample_coverage), np.max(sample_coverage)))
sim_full_sample_nlocs[simstring] = sample_coverage
nmissing = [int(x.strip().split()[1]) for x in infile[38:50]]
sim_full_loc_cov[simstring] = nmissing
## Just look at the ones we care about for ipyrad
print([(x,y) for x,y in sim_full_loc_cov.items() if "ipyrad" in x])
print([(x,y) for x,y in sim_full_sample_nlocs.items() if "ipyrad" in x])
In [406]:
## ipyrad stats
PYRAD_SIMOUT=os.path.join(PYRAD_DIR, "SIMDATA/")
for sim in ["simno", "simlo", "simhi", "simlarge"]:
simstring = "pyrad-"+sim
print("Doing - {}\t".format(sim)),
simdir = os.path.join(PYRAD_SIMOUT, sim)
statsfile = simdir + "/stats/c85d6m2p3H3N3.stats"
infile = open(statsfile).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[8:20]]
print("mean sample coverage - {}\t".format(np.mean(sample_coverage))),
print("min/max - {}/{}\t".format(np.min(sample_coverage), np.max(sample_coverage)))
sim_full_sample_nlocs[simstring] = sample_coverage
nmissing = [0] + [int(x.strip().split()[1]) for x in infile[26:37]]
sim_full_loc_cov[simstring] = nmissing
## Just look at the ones we care about for pyrad
print([(x,y) for x,y in sim_full_loc_cov.items() if "pyrad" in x])
print([(x,y) for x,y in sim_full_sample_nlocs.items() if "pyrad" in x])
In [111]:
import gzip
## stacks stats
STACKS_SIMOUT=os.path.join(STACKS_DIR, "SIMDATA/")
STACKS_GAP_SIMOUT=os.path.join(STACKS_SIMOUT, "gapped/")
STACKS_UNGAP_SIMOUT=os.path.join(STACKS_SIMOUT, "ungapped/")
STACKS_DEFAULT_SIMOUT=os.path.join(STACKS_SIMOUT, "default/")
for dir in [STACKS_GAP_SIMOUT, STACKS_UNGAP_SIMOUT, STACKS_DEFAULT_SIMOUT]:
stacks_method = dir.split("/")[-2]
for sim in ["simno", "simlo", "simhi", "simlarge"]:
#print("Doing - {}-{}".format(stacks_method, sim)),
simstring = "stacks_"+stacks_method+"-"+sim
try:
simdir = os.path.join(dir, sim)
lines = open("{}/batch_1.haplotypes.tsv".format(simdir)).readlines()
cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
sim_full_loc_cov[simstring] = [cnts.count(i) for i in range(1,13)]
except Exception as inst:
print("loc_cov - {} - {}".format(inst, simdir))
try:
sim_full_sample_nlocs[simstring] = []
samp_haps = glob.glob("{}/*matches*".format(simdir))
for f in samp_haps:
lines = gzip.open(f).readlines()
sim_full_sample_nlocs[simstring].append(len(lines) - 1)
except Exception as inst:
print("sample_nlocs - {} - {}".format(inst, simdir))
## Just look at the stacks results to reduce the clutter
print([(x,y) for x,y in sim_full_loc_cov.items() if "stacks" in x])
print([(x,y) for x,y in sim_full_sample_nlocs.items() if "stacks" in x])
AftrRAD doesn't natively support vcf. the vcf files we made are in sim*/Formatting/sim*.vcf
It's not straightforward how to get the locations for snps from the output files. aftrRAD does calculate and write these out as part of the Genotype.pl phase to a file called /TempFiles/SNPLocationsToPlot.txt
It's also not straightforward how to get the number of loci recovered for each invidual.
You have to total up the counts from two different files for each individual. The
Outputs/Genotypes
directory has two files, one for Haplotypes
and one for Monomorphics
.
The haplotypes file is a giant matrix of concatenated snps per locus so you have to
search through each column to count up all the NA
's (missing data per locus per individual).
Then you have to count up the number of non-zero monomorphic sites.
In [192]:
AFTRRAD_SIMOUT=os.path.join(AFTRRAD_DIR, "SIMDATA/")
for sim in ["simno", "simlo", "simhi", "simlarge"]:
#for sim in ["simlo"]:
simstring = "aftrrad-{}".format(sim)
print("Doing - {}".format(simstring))
simdir = os.path.join(AFTRRAD_SIMOUT, sim)
## The `17` in the file name is the min % of samples w/ data
hapsfile = simdir + "/Output/Genotypes/Haplotypes_17.All.txt"
monofile = simdir + "/Output/Genotypes/Monomorphics_17.txt"
## Set low_memory=False or else simlarge crashes pandas
mono_df = pd.read_csv(monofile, delim_whitespace=True, header=0, low_memory=False)
haps_df = pd.read_csv(hapsfile, delim_whitespace=True, header=0, index_col=0, low_memory=False)
sample_coverage = {}
## Get the list of all the simulated sample names
samplenames = mono_df.columns.values[1:]
## Do haplotypes file first because it is phased so there are 2 rows per sample
## so we end up double counting NA's. If you read the # of monomorphics first
## then you have to figure out some way to prevent the double counting, here
## we just count twice and overwrite.
for row in haps_df.itertuples():
sample_coverage[row[0].split("Individual")[1]] = len(pd.Series(row).nonzero()[0])
for sname in samplenames:
## nonzero returns a tuple of which we are only interested in the first element
sample_coverage[sname] += len(mono_df[sname].nonzero()[0])
## To sort or not to sort the coverage_df?
print("mean sample coverage - {}".format(np.mean(sample_coverage.values()))),
print("min/max - {}/{}".format(np.min(sample_coverage.values()), np.max(sample_coverage.values())))
## Put them in the dict sorted by sample name
sim_full_sample_nlocs[simstring] = [sample_coverage[x] for x in sorted(sample_coverage.keys())]
## Now do locus coverage
## For the same reason as above, do the haplotypes file first. We create the tmp_counts
## counter and then make a new counter with the index being 1/2 to account for the
## doublecounting of NaN in the haplotypes file.
## Also, note that annoyingly pandas reads in "NA" as float('nan') rather than string "NA"
## which took me a _while_ to figure out.
hap_nonzero = collections.Counter(haps_df.count())
tmp_counts = collections.Counter(hap_nonzero)
hap_counts = collections.Counter()
for x in tmp_counts:
hap_counts.update({x/2:tmp_counts[x]})
## Get monomorphics
## Get the sum of non-zero elements per row (the fancy .ix slicing drops the
## first column which is the sequence)
mono_nonzero = (mono_df.ix[:, 1:] != 0).astype(int).sum(axis=1)
mono_counts = collections.Counter(mono_nonzero)
tot_counts = hap_counts + mono_counts
## Collections are unordered, and also any coverage level with no count
## will not be in the collection, so we have to order it and insert zeros at the appropriate locations
dat = []
for i in xrange(1,13):
try:
dat.append(tot_counts[i])
except:
dat.append(0)
sim_full_loc_cov[simstring] = dat
print("Locus coverage")
print([(x,y) for x,y in sim_full_loc_cov.items() if "aftrrad" in x])
print("Sample nloci")
print([(x,y) for x,y in sim_full_sample_nlocs.items() if "aftrrad" in x])
Here's what's weird. The dDocent reference.fasta file which is generated by the cd-hit clustering looks like it contains the right number of clusters. Similarly, if you use samtools to view the sample Monomorphic loci obviously aren't going to end up in the final vcf. It is incredibly not straightforward how to actually get access to the monomorphic loci from the intermediary files that dDocent creates. The shortcuts that dDocent takes to run super fast mean it's really hard to get "stacks" of sequence data per locus (e.g. the ipyrad .loci file).-RG.bam
, and the mapped.bed
file has 10000 loci as well. But if you look at the Contigs that make it to the final vcf files (either TotalRawSNPs or Final.recode) there are ~100+ contigs missing. I'm sure freebayes is filtering loci, but i diff'd the loci in the bed file and the vcf, and looked at the sequences of one of the missing loci in the cat-RRG.bam file and they look more or less normal. It's not the mapping quality flaq (-m), because all sim seqs have high quality mapping. None of the other settings in the dDocent script or the default freebayes settings seem particularly conspicuous.
grep Contig TotalRawSNPs.vcf | cut -f 1 | uniq -c | wc -l
simno/simlo/simhi - 9887/9889/9890
For simlarge there was a much bigger difference between the TotalRaw and Final.recode vcf files (98350/91620), TotalRaw recovers about 98.4% of true loci, but the final recode catches only about 92%.
It's also not super straightforward how to get # loci per sample from the dDocent output, so I had to gin something up. Right now because it uses samtools to view the bam file and then does some shell manipulation, this is a little slow (20 seconds per simulation treatment), but tolerable.
TODO: This is currently not tracking which samples correspond with the number of loci recovered bcz the ddocent bam files use the weird naming scheme and I haven't back calculated the good names yet.
This isn't exactly technically correct, because this is the counts of loci mapped for each individual, not the number of loci for each individual in the output!
Also, this is a little slow. Takes about 30-40 minutes.
In [ ]:
import subprocess
DDOCENT_SIMOUT=os.path.join(DDOCENT_DIR, "SIMDATA/")
for sim in ["simno", "simlo", "simhi", "simlarge"]:
#for sim in ["simlo"]:
simdir = os.path.join(DDOCENT_SIMOUT, sim + "/" + sim + "_fastqs/")
print("Doing {} - {}".format(sim, simdir))
## This is hackish. You have to dip into the bam file to get locus counts that
## include counts for monomorphics. Note! that field 3 of the bam file is not
## the sequence data, but rather the dDocent mock-contig ('dDocent_Contig_3')
## the read maps to. So really this is kind of by proxy counting the number of loci.
sample_coverage = []
for samp in glob.glob(simdir + "/*-RG.bam"):
cmd = "{}samtools view {} | cut -f 3 | uniq | wc -l".format(DDOCENT_DIR, samp)
res = subprocess.check_output(cmd, shell=True)
sample_coverage.append(int(res.strip()))
print("This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files.")
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))
print("Reading VCF")
vcf_filt = pd.read_csv("{}/Final.recode.vcf".format(simdir), delim_whitespace=True, header=60, index_col=0)
vcf_full = pd.read_csv("{}/TotalRawSNPs.vcf".format(simdir), delim_whitespace=True, header=60, index_col=0)
for vcf_string, vcf_df in zip(["full", "filt"], [vcf_full, vcf_filt]):
simstring = "ddocent_" + vcf_string + "-" + sim
print("Doing {}".format(simstring))
## Sample coverage is the same for both conditions here, because they are based on the
## same raw data.
idxs = set(vcf_df.index)
## Here we are going to simultaneously accumulate information about nloci per sample and locus coverage counts.
c = []
sample_cov_counter = collections.Counter()
for i, idx in enumerate(idxs):
## Poor man's progress bar. For the impatient.
if not i % 1000:
print(i),
## Complicated song and dance here because if the locus only has one snp you have to do
## some bullshit to reshape the dataframe.
tmp_df = vcf_df.loc[idx]
if isinstance(tmp_df, pd.Series):
tmp_df = tmp_df.to_frame().T
## The double apply is applying split() to all rows and columns to get the genotype data
nonzero_samps = (tmp_df.iloc[:, 8:20].apply(lambda x: x.apply(lambda y: y.split(":")[0])) != "./.").\
astype(int).sum().nonzero()[0]
sample_cov_counter.update(nonzero_samps)
c.append(len(nonzero_samps))
## The double apply is applying split() to all rows and columns to get the genotype data
# count = len((tmp_df.iloc[:, 8:20].\
# apply(lambda x: x.apply(lambda y: y.split(":")[0])) != "./.").astype(int).sum().nonzero()[0])
# c.append(count)
counts = collections.Counter(c)
## Fill in zero values that aren't in the locus counter
dat = []
for i in xrange(1,13):
try:
dat.append(counts[i])
except:
dat.append(0)
sim_full_loc_cov[simstring] = dat
sim_full_sample_nlocs[simstring] = sample_cov_counter.values()
# Here's a different stupid way to do this...
#
# sample_counts = {}
# sample_names = list(vcf.columns)[8:]
# print(sample_names)
# print("num loci - {}".format(len(set(vcf.index))))
# for name in sample_names:
# print(name)
# sample_counts[name] = 0
# for locus in set(vcf.index):
# snps = vcf[name][locus]
# if any(map(lambda x: x.split(":")[0] != "./.", snps)):
# sample_counts[name] += 1
# else:
# pass
# #print("{} {} {}".format(name, locus, snps))
# print(sample_counts)
print("Locus coverage")
print([(x,y) for x,y in sim_full_loc_cov.items() if "ddocent" in x])
print("Sample nloci")
print([(x,y) for x,y in sim_full_sample_nlocs.items() if "ddocent" in x])
In [502]:
import pickle
sim_res_dict = {"sim_sample_nlocs_df":sim_full_sample_nlocs, "sim_full_loc_cov":sim_full_loc_cov}
for k,v in sim_res_dict.items():
print(k)
pickle.dump(v, open(WORK_DIR + "RESULTS/" + k + ".p", 'wb'))
In [443]:
simlevel = ["simno", "simlo", "simhi", "simlarge"]
assemblers = ["ipyrad", "pyrad", "stacks_gapped", "stacks_ungapped",\
"stacks_default", "aftrrad", "ddocent_full", "ddocent_filt"]
sim_sample_nlocs_df = pd.DataFrame(index=assemblers, columns=simlevel)
for sim in simlevel:
for ass in assemblers:
simstring = "-".join([ass, sim])
sim_sample_nlocs_df[sim][ass] = np.mean(sim_full_sample_nlocs[simstring])
print("Mean number of loci recovered per sample.")
## Normalize all bins
dat = sim_sample_nlocs_df[sim_sample_nlocs_df.columns].astype(float)
for sim in simlevel:
scale = 10000
if sim == "simlarge":
scale = 100000
dat[sim] = dat[sim]/scale
sns.heatmap(dat, square=True, center=1, linewidths=2, annot=True)
print(sim_sample_nlocs_df)
#print(dat)
In [552]:
simlevel = ["simno", "simlo", "simhi", "simlarge"]
assemblers = ["ipyrad", "pyrad", "stacks_gapped", "stacks_ungapped",\
"stacks_default", "aftrrad", "ddocent_full", "ddocent_filt"]
df = pd.DataFrame(index=assemblers, columns=simlevel)
df2 = pd.DataFrame(columns = ["assembler", "simtype"] + [x for x in xrange(1,13)])
df3 = pd.DataFrame(columns = ["assembler", "simtype", "simdata"])
## Try to get a dataframe in the shape seaborn will be happy with
dfsns = pd.DataFrame(columns = ["assembler", "simtype", "bin", "simdata"])
for sim in simlevel:
for ass in assemblers:
simstring = ass + "-" + sim
df[sim][ass] = np.array(sim_full_loc_cov[simstring])
df2.loc[simstring] = [ass, sim] + sim_full_loc_cov[simstring]
df3.loc[simstring] = [ass, sim, np.array(sim_full_loc_cov[simstring])]
for i, val in enumerate(sim_full_loc_cov[simstring]):
## Normalize values so different sim sizes print the same
max = 10000.
if "large" in simstring:
max = 100000.
dfsns.loc[simstring + "-" + str(i)] = [ass, sim, i+1, val/max]
sns.set(style="white")
## This kinda sucks, too spread out.
#g = sns.FacetGrid(dfsns, row="assembler", col="simtype", margin_titles=True)
#g.map(sns.barplot, "bin", "simdata", color="steelblue", lw=0)
plt.rcParams['figure.figsize']=(20,20)
g = sns.FacetGrid(dfsns, col="simtype", margin_titles=True, col_wrap=2, size=4, aspect=2)
g.map(sns.barplot, "bin", "simdata", "assembler", lw=0.5, palette="bright").add_legend()
Out[552]:
In [553]:
## Barplots of assembly method x simdata type
g = sns.FacetGrid(dfsns, col="assembler", margin_titles=True, col_wrap=4, size=3, aspect=1.5)
g.map(sns.barplot, "bin", "simdata", "simtype", lw=0.5, palette="bright").add_legend()
Out[553]:
In [598]:
dfsns2 = pd.DataFrame(columns = ["assembler", "simtype", "bin", "simdata"])
for sim in simlevel:
for ass in assemblers:
simstring = ass + "-" + sim
## Normalize values so different sim sizes print the same
max = 10000.
if "large" in simstring:
max = 100000.
newdat = sim_full_loc_cov[simstring]
newdat = [sum(newdat)-sum(newdat[:i-1]) for i in range(1,13)]
for i, val in enumerate(newdat):
dfsns2.loc[simstring + "-" + str(i)] = [ass, sim, i+1, val]
g = sns.FacetGrid(dfsns2, col="simtype", hue="assembler", sharex=True, sharey=False, size=4, col_wrap=2)
g.map(plt.scatter, "bin", "simdata")
g.map(plt.plot, "bin", "simdata").add_legend()
axs = g.axes
for ax in axs:
ax.set_xlim(0,12.5)
In [603]:
vcf_dict = {}
vcf_dict["ipyrad"] = os.path.join(IPYRAD_DIR, "REALDATA/REALDATA_outfiles/REALDATA.vcf")
vcf_dict["pyrad"] = os.path.join(PYRAD_EMPIRICAL_OUTPUT, "outfiles/c85d6m2p3H3N3.vcf")
vcf_dict["stacks_ungapped"] = os.path.join(STACKS_DIR, "REALDATA/ungapped/batch_1.vcf")
vcf_dict["stacks_gapped"] = os.path.join(STACKS_DIR, "REALDATA/gapped/batch_1.vcf")
vcf_dict["stacks_default"] = os.path.join(STACKS_DIR, "REALDATA/default/batch_1.vcf")
vcf_dict["aftrrad"] = os.path.join(AFTRRAD_DIR, "REALDATA/Formatting/REALDATA.vcf")
vcf_dict["ddocent_full"] = os.path.join(DDOCENT_DIR, "REALDATA/TotalRawSNPs.vcf")
vcf_dict["ddocent_filt"] = os.path.join(DDOCENT_DIR, "REALDATA/Final.recode.vcf")
for k, f in vcf_dict.items():
if os.path.exists(f):
print("found - {}".format(f))
## If it's gzipped then unzip it (only applies to ipyrad i think)
if ".gz" in f:
print("gunzipping")
cmd = "gunzip {}".format(f)
!cmd
vcf_dict[k] = f.split(".gz")[0]
## Remove all but biallelic (for ipyrad this also removes all the monomorphic)
#outfile = f.split(".vcf")[0]+"-biallelic"
#cmd = "{}vcftools --vcf {} --min-alleles 2 --max-alleles 2 --recode --out {}" \
# .format(DDOCENT_DIR, f, outfile)
#print(cmd)
#!$cmd
## update the vcf_dict
#vcf_dict[k] = outfile + ".recode.vcf"
else:
print("not found - {}".format(f))
In [604]:
emp_loc_cov = collections.OrderedDict()
emp_snp_cov = collections.OrderedDict()
emp_sample_nsnps = collections.OrderedDict()
emp_sample_nlocs = collections.OrderedDict()
## Try just doing them all the same
for prog, filename in vcf_dict.items():
try:
print("Doing - {}".format(prog))
print("\t{}".format(filename))
v = vcfnp.variants(filename, dtypes={"CHROM":"a24"}).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
emp_loc_cov[prog] = loci_coverage(v, c, prog)
emp_snp_cov[prog] = snp_coverage(c)
emp_sample_nsnps[prog] = sample_nsnps(c)
emp_sample_nlocs[prog] = sample_nloci(v, c, prog)
plotPCA(c, prog)
plotPairwiseDistance(c, prog)
except Exception as inst:
print(inst)
In [608]:
for i in emp_sample_nlocs:
print(i),
print(emp_sample_nlocs[i])
In [70]:
IPYRAD_EMPIRICAL_OUTPUT=os.path.join(IPYRAD_DIR, "REALDATA/")
IPYRAD_STATS = os.path.join(IPYRAD_EMPIRICAL_OUTPUT, "REALDATA_outfiles/REALDATA_stats.txt")
infile = open(IPYRAD_STATS).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[20:33]]
print(sample_coverage)
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))
Read the biallelic vcf
In [71]:
filename = os.path.join(IPYRAD_EMPIRICAL_OUTPUT, "REALDATA_outfiles/REALDATA.biallelic.vcf")
# filename = vcf_dict["ipyrad"]
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
Getting variable sites and parsimony informative sites from the vcf is kind of annoying because all the programs export slightly different formats, so you need to parse them in slightly different ways. There's a better way to do this for ipyrad but i figure i'll do it the same way for all of them so it's more clear what's happening.
In [72]:
## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["POS"]), mask=only_pis).compressed()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x) for x in pis])
#distpis = [x for x in sorted(counts.items())]
## Getting the distvar is easier
distvar = Counter([int(x) for x in v.POS])
#distvar = [x for x in sorted(counts.items())]
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
#toyplot.html.render(canvas, 'snp_positions.html')
canvas
Out[72]:
In [113]:
## Have to gunzip the ipyrad REALDATA vcf
plotPCA(c, "ipyrad")
In [114]:
plotPairwiseDistance(c, "pyrad")
In [206]:
PYRAD_EMPIRICAL_OUTPUT=os.path.join(PYRAD_DIR, "REALDATA/")
PYRAD_STATS = os.path.join(PYRAD_EMPIRICAL_OUTPUT, "stats/c85d6m2p3H3N3.stats")
infile = open(PYRAD_STATS).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[8:21]]
print(sample_coverage)
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))
Be careful because sometimes the pyrad vcf gets a newline inserted between the last line of metadata and the first line of genotypes, which causes vcfnp to silently fail. Also there are a couple flags you can pass to vcfnp for debugging, but the print hella data to the console.
v = vcfnp.variants(filename, progress=1, verbose=True).view(np.recarray)
In [230]:
filename = os.path.join(PYRAD_EMPIRICAL_OUTPUT, "outfiles/c85d6m2p3H3N3.vcf")
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
In [231]:
## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["POS"]), mask=only_pis).compressed()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x) for x in pis])
#distpis = [x for x in sorted(counts.items())]
## Getting the distvar is easier
distvar = Counter([int(x) for x in v.POS])
#distvar = [x for x in sorted(counts.items())]
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
#toyplot.html.render(canvas, 'snp_positions.html')
canvas
Out[231]:
In [12]:
STACKS_OUTPUT=os.path.join(STACKS_DIR, "REALDATA/")
STACKS_GAP_OUT=os.path.join(STACKS_OUTPUT, "gapped/")
STACKS_UNGAP_OUT=os.path.join(STACKS_OUTPUT, "ungapped/")
STACKS_DEFAULT_OUT=os.path.join(STACKS_OUTPUT, "default/")
#lines = open("SIMsmall/stackf_high/batch_1.haplotypes.tsv").readlines()
#cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
#shigh = [cnts.count(i) for i in range(1,13)]
In [13]:
filename = os.path.join(STACKS_UNGAP_OUT, "batch_1.vcf")
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
In [14]:
## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["ID"]), mask=only_pis).compressed()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x.split("_")[1]) for x in pis])
#distpis = [x for x in sorted(counts.items())]
## Getting the distvar is easier
distvar = Counter([int(x.split("_")[1]) for x in v.ID])
#distvar = [x for x in sorted(counts.items())]
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
#toyplot.html.render(canvas, 'snp_positions.html')
canvas
Out[14]:
In [15]:
plotPCA(c, "stacks ungapped")
In [16]:
plotPairwiseDistance(c, "stacks ungapped")
In [16]:
filename = os.path.join(STACKS_DEFAULT_OUT, "batch_1.vcf")
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
In [17]:
## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["ID"]), mask=only_pis).compressed()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x.split("_")[1]) for x in pis])
#distpis = [x for x in sorted(counts.items())]
## Getting the distvar is easier
distvar = Counter([int(x.split("_")[1]) for x in v.ID])
#distvar = [x for x in sorted(counts.items())]
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
toyplot.html.render(canvas, 'snp_positions.html')
canvas
Out[17]:
In [18]:
plotPCA(c, "stacks default")
In [12]:
plotPairwiseDistance(c, "stacks default")
Out[12]:
In [ ]:
AFTRRAD_OUTPUT=os.path.join(AFTRRAD_DIR, "REALDATA/")
In [ ]:
filename = os.path.join(STACKS_DEFAULT_OUT, "batch_1.vcf")
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
NOT TESTED
It's not very straightforward how to extract information about whether each snp is informative or not, given the output files that aftrRAD generates. It does create a file with all the snp locations so we can use this to generate the distribution of variable sites, but the pis will be empty.
In [184]:
f = os.path.join(AFTRRAD_OUTPUT, "TempFiles/SNPLocationsToPlot.txt")
with open(f) as infile:
snplocs = infile.read().split()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([])
## Getting the distvar is easier
distvar = Counter(map(int, snplocs))
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
## toyplot.html.render(canvas, 'snp_positions.html')
canvas
Out[184]:
In [ ]:
import subprocess
DDOCENT_OUTPUT=os.path.join(DDOCENT_DIR, "REALDATA/")
os.chdir(DDOCENT_OUTPUT)
print("Getting max loci per sample from bam files.")
sample_coverage = []
for samp in glob.glob("*-RG.bam"):
cmd = "{}samtools view {} | cut -f 3 | uniq | wc -l".format(DDOCENT_DIR, samp)
res = subprocess.check_output(cmd, shell=True)
print("nloci {} - {}".format(res.strip(), samp))
sample_coverage.append(int(res.strip()))
print(sorted(sample_coverage, reverse=True))
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))
#sim_coverage_df["ddocent_"+sim] = sample_coverage
vcf_filt = pd.read_csv("Final.recode.vcf", delim_whitespace=True, header=60, index_col=0)
vcf_full = pd.read_csv("TotalRawSNPs.vcf", delim_whitespace=True, header=60, index_col=0)
## There's gotta be a faster way to do this...
## It's really fuckin slow on real data, like hours.
for vcf in [vcf_full, vcf_filt]:
sample_counts = {}
sample_names = list(vcf.columns)[8:]
print(sample_names)
print("num loci in vcf - {}".format(len(set(vcf.index))))
for name in sample_names:
print(name)
sample_counts[name] = 0
for locus in set(vcf.index):
snps = vcf[name][locus]
if any(map(lambda x: x.split(":")[0] != "./.", snps)):
sample_counts[name] += 1
else:
pass
print(sample_counts)
print(sim_coverage_df)
In [171]:
## Basic difference btw Final.recode and TotalRawSNPs is that final recode
## only includes individuals with snps called in > 90% of samples
f1 = os.path.join(DDOCENT_OUTPUT, "TotalRawSNPs.vcf")
v_full = vcfnp.variants(f1, dtypes={"CHROM":"a24"}).view(np.recarray)
c_full = vcfnp.calldata_2d(f1).view(np.recarray)
f2 = os.path.join(DDOCENT_OUTPUT, "Final.recode.vcf")
v_filt = vcfnp.variants(f2, dtypes={"CHROM":"a24"}).view(np.recarray)
c_filt = vcfnp.calldata_2d(f2).view(np.recarray)
In [75]:
for dset in [[v_full, c_full], [v_filt, c_filt]]:
## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), dset[1]["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(dset[0]["POS"]), mask=only_pis).compressed()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x) for x in pis])
## Getting the distvar is easier
distvar = Counter([int(x) for x in v_filt.POS])
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
## toyplot.html.render(canvas, 'snp_positions.html')
canvas
In [82]:
#Tmp delete me
print(np.sum(distvar.values()))
print(np.sum(distpis.values()))
In [238]:
for dset in [[v_full, c_full], [v_filt, c_filt]]:
plotPCA(dset[1], "ddocent")
In [239]:
plotPairwiseDistance(c_full, "dDocent Full")
plotPairwiseDistance(c_filt, "dDocent Filtered")
In [584]:
## Testing for different vcf formats
prog = "stacks-simhi"
filename = "/home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simhi/batch_1.vcf"
#prog = "dDocent-simhi"
#filename = "/home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/TotalRawSNPs.vcf"
#prog = "ipyrad-simlarge"
#filename = "/home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlarge/simlarge_outfiles/simlarge-biallelic.recode.vcf"
prog = "pyrad-simhi"
filename = "/home/iovercast/manuscript-analysis/pyrad/SIMDATA/simhi/outfiles/c85d6m2p3H3N3.vcf"
v = vcfnp.variants(filename, dtypes={"CHROM":"a24"}).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
print("loc_cov", loci_coverage(v, c, prog))
print("snp_cov", snp_coverage(c))
print("snps", sample_nsnps(c))
print("nlocs", sample_nloci(v, c, prog))
In [49]:
## Maybe useful
print(v.ID)
print(len(v.ID))
print(len(set([x.split("_")[0] for x in v.ID])))
In [37]:
import itertools
import numpy.ma as ma
print(v.ID)
print(len(v.ID))
loc_list = []
loci = set([x.split("_")[0] for x in v.ID])
print(len(loci))
loc_list = [list(val) for key,val in itertools.groupby(v.ID,key=lambda x:x.split("_")[0])]
print(len(loc_list))
## experimentation
print(v.dtype)
print(c.dtype)
print(np.mean(c.DP))
print(loc_list[0:10])
subsamp = np.array([np.random.choice(x, 1, replace=False)[0] for x in loc_list])
mask = np.in1d(v.ID, subsamp)
In [55]:
g = allel.GenotypeArray(cmask.genotype)
gn = g.to_n_alt()
m = allel.stats.rogers_huff_r(gn[:1000]) ** 2
ax = allel.plot.pairwise_ld(m)
In [ ]:
## Some crap from when i was counting loci a very stupid and slow way for stacks.
### This doesn't' count monomorphics, but could be useful still. It's dog slow.
#vcf = pd.read_csv("{}/batch_1.vcf".format(simdir), delim_whitespace=True, header=9, index_col=2)
#sample_counts = {}
#sample_names = list(vcf.columns)[8:]
#print(sample_names)
#uniq_loci = set([x.split("_")[0] for x in vcf.index])
#print("num loci - {}".format(len(uniq_loci)))
## Replace the ID index in the stacks vcf file so we can actually evaluate loci
#vcf.index = pd.Index([x.split("_")[0] for x in vcf.index])
#for name in sample_names:
# print(name),
# sample_counts[name] = 0
# for locus in uniq_loci:
# snps = vcf[name][locus]
# if any(map(lambda x: x.split(":")[0] != "./.", snps)):
# sample_counts[name] += 1
# else:
# pass
# #print("{} {} {}".format(name, locus, snps))
#print(sample_counts)
#sim_full_sample_nlocs["stacks_"+stacks_method+"-"+sim] = sample_coverage
In [191]:
filename = os.path.join(DDOCENT_OUTPUT, "Final.recode.snps.vcf")
v_filt = vcfnp.variants(filename).view(np.recarray)
c_filt = vcfnp.calldata_2d(filename).view(np.recarray)
In [220]:
from collections import Counter
print(v_filt.dtype)
#wat = Counter(vcall["POS"])
#wat
## DDOCENT
v_filt["is_snp"]
print(v_filt[0:2])
print(v_filt["NS"][:40])
print(v_filt["NUMALT"][:40])
print(v_filt["ALT"][:40])
loci = set(v_filt["CHROM"])
In [200]:
print(c.dtype)
print(len(c))
print(c["GT"][0:10][:,0])
print((c["GT"][0:10000][:,0] == "./.").sum())
print(c["GT"][0:5] != "./.")
print((x != "./.").sum() for x in c["GT"][0:5])
from collections import Counter
loci_coverage = [x.sum() for x in c_filt["GT"].T != "./."]
print(loci_coverage)
In [599]:
#x = sim_loc_cov
#x = sim_snp_cov
#x = sim_sample_nsnps
x = sim_sample_nlocs
for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
for prog in ["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", "stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"]:
try:
print(prog+sim+"\t"),
print(x[prog+sim]),
print(np.mean(x[prog+sim]))
except:
print("No {}".format(prog+sim))
print("------------------------------------------------------")
In [338]:
if True:
keys = ["e", "b", "b", "c", "d", "e", "e", 'a']
values = [1.2, 4.5, 4.3, 2.0, 5.67, 8.08, 9.01,1]
print('two methods of splitting by group')
print('as list')
for k,v in zip(*npi.group_by(keys)(values)):
print(k, v)
print('as iterable')
g = npi.group_by(keys)
for k,v in zip(g.unique, g.split_sequence_as_iterable(values)):
print(k, list(v))
print('iterable as iterable')
for k, v in zip(g.unique, g.split_iterable_as_iterable(values)):
print(k, list(v))