In [36]:
## import libraries
import ipyrad as ip ## ipyrad
import numpy as np ## array operations
import h5py ## access hdf5 database file
import toyplot ## my fav new plotting library
import toyplot.html ## toyplot sublib for saving html plots
## print versions for posterity
print 'ipyrad', ip.__version__
print 'numpy', np.__version__
print 'h5py', h5py.__version__
print 'toyplot', toyplot.__version__
I assembled the data set under three minimum cluster depth settings (6, 10, 20) and imported the resulting Assembly objects as data1, data2, and data3. These are ipyrad Assembly class objects, which have many attributes and functions available. To see them, type the object name (e.g., data1) followed by a period (.) and then press tab to see a list of all the available options.
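The same information can also be listed programmatically. The cell below is a minimal sketch (assuming an Assembly object named data1 is already loaded) that prints the object's public attributes.
In [ ]:
## a sketch of listing Assembly attributes programmatically
## (assumes an Assembly object named `data1` is already loaded)
print [i for i in dir(data1) if not i.startswith("_")]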
In [37]:
## import Assembly objects
data = ip.load_json("/home/deren/Downloads/pedicularis/pedictrim5.json")
We also open the database file for each data set. This is the file with the suffix ".hdf5", which should have a name like "[project_dir]/[name]_consens/[name].hdf5". The Assembly class objects store the database file path under the attribute ".database", which makes it easy to access. Below we open a view into the hdf5 database file for each of the three assemblies.
In [38]:
## load the hdf5 database
io5 = h5py.File(data.database, 'r')
In [39]:
print 'location of my database file:\n ', data.database
print 'keys in the hdf5 database\n ', io5.keys()
In [40]:
## This doesn't actually load the arrays into memory, since they can be very large.
## It just stores references so the keys can be accessed more easily.
#hcatg = io5["catgs"] ## depth information (not edge filtered)
#hseqs = io5["seqs"] ## sequence data (not edge filtered)
hsnps = io5["snps"] ## snp locations (edge filtered)
hfilt = io5["filters"] ## locus filters
hedge = io5["edges"] ## edge filters
## array shapes and dtypes
#print hcatg
#print hseqs
print hsnps
print hfilt
print hedge
In [41]:
def filter_snps(data):
    ## get h5 database
    io5 = h5py.File(data.database, 'r')
    hsnps = io5["snps"]      ## snp locations
    hfilt = io5["filters"]   ## locus filters
    hedge = io5["edges"]     ## edge filters

    ## read in a local copy of the full snps and edge arrays
    snps = hsnps[:]
    edge = hedge[:]

    ## print status
    print "prefilter {}\nshape {} = (nloci, maxlen, [var,pis])"\
          .format(data.name, snps.shape)
    print "total vars = {}".format(snps[:, :, 0].sum())

    ## apply edge filters to all loci in the snps array
    for loc in xrange(snps.shape[0]):
        a, b = edge[loc, :2]
        mask = np.invert([i in range(a, b) for i in np.arange(snps.shape[1])])
        snps[loc, mask, :] = 0

    ## get locus filter by summing across all filters
    locfilter = hfilt[:].sum(axis=1).astype(np.bool)

    ## apply filter to snps array
    fsnps = snps[~locfilter, ...]

    ## print new shape and sum
    print "postfilter {}\nshape {} = (nloci, maxlen, [var,pis])"\
          .format(data.name, fsnps.shape)
    print "total vars = {}".format(fsnps[:, :, 0].sum())

    ## clean up big objects
    del snps
    del edge

    ## return what we want
    return fsnps
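The per-locus loop above is easy to read but can be slow for very large arrays. Below is a minimal sketch (not part of the original function) of an equivalent vectorized mask using numpy broadcasting; it assumes arrays like the local snps (nloci, maxlen, 2) and edge copies made inside filter_snps.
In [ ]:
## a vectorized sketch of the edge-masking step above (an alternative
## for illustration, not ipyrad code); `snps` and `edge` are arrays like
## the local copies made inside filter_snps
positions = np.arange(snps.shape[1])
## keep[i, j] is True where site j falls inside locus i's [start, end) edges
keep = (positions >= edge[:, 0:1]) & (positions < edge[:, 1:2])
## zero out both the var and pis columns at sites outside the edges
snps[~keep] = 0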
In [42]:
def filter_snps(data):
    ## get h5 database
    io5 = h5py.File(data.database, 'r')
    hsnps = io5["snps"]      ## snp locations
    hfilt = io5["filters"]   ## locus filters
    #hedge = io5["edges"]    ## edge filters

    ## read in a local copy of the full snps array
    snps = hsnps[:]
    #edge = hedge[:]

    ## get locus filter by summing across all filters
    locfilter = hfilt[:].sum(axis=1).astype(np.bool)

    ## apply filter to snps array
    fsnps = snps[~locfilter, ...]

    ## clean up big objects
    del snps

    ## return what we want
    return fsnps
In [43]:
fsnps.sum(axis=0)
Out[43]:
In [44]:
## apply the filter to the data set
fsnps = filter_snps(data)
In [45]:
## scratch: check taking the element-wise max across two stacked arrays
a = np.arange(0, 40)#.reshape(10,4)
b = np.arange(10, 50)#.reshape(10, 4)
#np.concatenate((a,b), axis=1)
np.array([a, b]).max(axis=0)
Out[45]:
In [46]:
## the last dimension has two columns (var, pis)
## how many snps per locus
varlocs = fsnps[:, :, 0].sum(axis=1)
pislocs = fsnps[:, :, 1].sum(axis=1)
print varlocs[:5]
## how many snps per site (across loci)
persite = fsnps[:, :, :].sum(axis=0)
print persite[10:15]
In [47]:
colormap = toyplot.color.Palette()
colormap
Out[47]:
In [48]:
## deconstruct arrays into histogram bins
vbars, vbins = np.histogram(varlocs, bins=range(0, varlocs.max()+2))
pbars, pbins = np.histogram(pislocs, bins=range(0, varlocs.max()+2))

## set up canvas and axes
canvas = toyplot.Canvas(width=350, height=300)
axes = canvas.cartesian(xlabel="n variable (or pis) sites",
                        ylabel="n loci w/ n var (or pis) sites",
                        gutter=50)

## set up x axis
axes.x.domain.max = 16
axes.x.spine.show = False
axes.x.ticks.labels.style = {"baseline-shift":"10px"}
axes.x.ticks.locator = toyplot.locator.Explicit(
    range(0, 16, 2),
    map(str, range(0, 16, 2)))

## set up y axis
axes.y.ticks.show = True
axes.y.label.style = {"baseline-shift":"35px"}
axes.y.ticks.labels.style = {"baseline-shift":"5px"}
axes.y.ticks.below = 0
axes.y.ticks.above = 5
axes.y.domain.min = 0
axes.y.domain.max = 10000
axes.y.ticks.locator = toyplot.locator.Explicit(
    range(0, 11000, 2500),
    map(str, range(0, 11000, 2500)))

## add bars
axes.bars(vbars, color=colormap[1], opacity=0.5)
axes.bars(pbars, color=colormap[0], opacity=0.5)

## or as a filled/smoothed plot
#x = np.arange(0, len(pbars))
#fill = axes.fill(x, vbars, color=colormap[0], opacity=0.5)
#fill = axes.fill(x, pbars, color=colormap[1], opacity=0.5)
Out[48]:
In [49]:
## the snps array is longer than the actual seq length (it is padded),
## so we lop off the extra at the end. Find the last position with data.
maxend = np.where(fsnps[:, :, :].sum(axis=0).sum(axis=1) != 0)[0].max()

## all variable sites (including autapomorphies)
distvar = np.sum(fsnps[:, :maxend+1, 0].astype(np.int), axis=0)
print(distvar)

## synapomorphies (just pis)
distpis = fsnps[:, :maxend+1, 1].sum(axis=0)
print(distpis)

## index of the last variable position
print 'maxend = ', maxend
In [50]:
def SNP_position_plot(distvar, distpis):
    ## set color theme
    colormap = toyplot.color.Palette()

    ## make a canvas
    canvas = toyplot.Canvas(width=800, height=300)

    ## make axes
    axes = canvas.cartesian(xlabel="Position along RAD loci",
                            ylabel="N variable sites",
                            gutter=65)

    ## x-axis
    axes.x.ticks.show = True
    axes.x.label.style = {"baseline-shift":"-40px", "font-size":"16px"}
    axes.x.ticks.labels.style = {"baseline-shift":"-2.5px", "font-size":"12px"}
    axes.x.ticks.below = 5
    axes.x.ticks.above = 0
    axes.x.domain.max = maxend
    axes.x.ticks.locator = toyplot.locator.Explicit(
        range(0, maxend, 5),
        map(str, range(0, maxend, 5)))

    ## y-axis
    axes.y.ticks.show = True
    axes.y.label.style = {"baseline-shift":"40px", "font-size":"16px"}
    axes.y.ticks.labels.style = {"baseline-shift":"5px", "font-size":"12px"}
    axes.y.ticks.below = 0
    axes.y.ticks.above = 5

    ## add fill plots
    x = np.arange(0, maxend+1)
    f1 = axes.fill(x, distvar, color=colormap[0], opacity=0.5,
                   title="total variable sites")
    f2 = axes.fill(x, distpis, color=colormap[1], opacity=0.5,
                   title="parsimony informative sites")

    ## add horizontal dashed lines at the median N snps per site
    axes.hlines(np.median(distvar), opacity=0.9, style={"stroke-dasharray":"4, 4"})
    axes.hlines(np.median(distpis), opacity=0.9, style={"stroke-dasharray":"4, 4"})

    return canvas, axes
In [51]:
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
toyplot.html.render(canvas, 'snp_positions.html')
## show fig
canvas
#axes
Out[51]:
I think the more likely explanation is that poor alignment towards the ends of reads is causing the excess SNPs, for two reasons. First, if it were sequencing error, then we would expect an excess of autapomorphies at the ends of reads equal to or greater than the excess of synapomorphies we observe, but that is not the case. Second, increasing the minimum depth does fix the problem.
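To put a number on the first point, the sketch below approximates per-site autapomorphy counts as the total variable counts minus the parsimony-informative counts, using the distvar and distpis arrays computed above, and compares the read ends against the overall medians; the 10-site window is an arbitrary choice for illustration.
In [ ]:
## approximate autapomorphies per site as total variable sites minus pis
## (a sketch using the distvar and distpis arrays computed above; the
## 10-site window is an arbitrary choice for illustration)
distauto = distvar - distpis
print "median autapomorphies per site:", np.median(distauto)
print "mean autapomorphies in last 10 sites:", distauto[-10:].mean()
print "median synapomorphies per site:", np.median(distpis)
print "mean synapomorphies in last 10 sites:", distpis[-10:].mean()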
In [ ]:
## make a new branch of cyatho-d6-min4
data5 = data1.branch('cyatho-d6-min4-trim')
## edge filter order is (R1left, R1right, R2left, R2right)
## we set R1right to -10, which trims 10 bases from the right side
data5.set_params('edge_filter', "4, -10, 4, 4")
## run step 7 to fill the new database w/ filters
data5.run('7', force=True)
In [169]:
data5 = ip.load_json("~/Downloads/pedicularis/cyatho-d12-min4")
In [170]:
## filter the snps
fsnps = filter_snps(data5)
## trim non-data (padding) from the end
maxend = np.where(fsnps[:, :, :].sum(axis=0).sum(axis=1) != 0)[0].max()
## all variable sites (including autapomorphies)
distvar = np.sum(fsnps[:, :maxend+1, 0].astype(np.int), axis=0)
print(distvar)
## synapomorphies (just pis)
distpis = fsnps[:, :maxend+1, 1].sum(axis=0)
print(distpis)
## index of the last variable position
print 'maxend = ', maxend
In [171]:
SNP_position_plot(distvar, distpis)
Out[171]:
I found early on that leaving the cut site attached to the left side of reads improved assemblies by acting as an anchor, which allowed gap openings to arise in the early parts of the read without being treated differently, i.e., as terminal gap openings. For some reason it didn't occur to me to similarly anchor the right side of reads. Let's see what happens if I add an invariant anchor to the right side of reads.
Another point to note, though I don't show the results here: this increase in variation along the length of reads is not observed in simulated data, suggesting it is inherent to real data.
In [15]:
simdata = ip.load_json("~/Documents/ipyrad/tests/cli/cli.json")
## filter the snps
fsnps = filter_snps(simdata)
## trim non-data
maxend = np.where(fsnps[:, :, :].sum(axis=0).sum(axis=1) != 0)[0].max()
## all variable sites (including autapomorphies)
distvar = np.sum(fsnps[:, :maxend+1, 0].astype(np.int), axis=0)
## synapomorphies (just pis)
distpis = fsnps[:, :maxend+1, 1].sum(axis=0)
SNP_position_plot(distvar, distpis)
Out[15]:
In [172]:
## import the new version of ipyrad w/ this update.
import ipyrad as ip
print ip.__version__