In [2]:
%%bash
# Sanity-check the runtime environment: confirm the required external tools
# (R, python, wgsim, and seqqs' "pairs") are on PATH, then list the Python
# packages installed in the active virtualenv.
which R
which python
# Locations on percy.anu.edu.au
# export PATH=$PATH:"/mnt/rescued_fast/kevin/ws/gbsqc/tools/seqqs/":"/usr/local/src/samtools/misc/"
which wgsim
which pairs
# IPython notebook should be started within a virtual environment.
# After "workon <virtual_env>", check dependencies:
pip freeze
In [48]:
# Make sure we are working in <download_path>/brachy_khmer/doc/_tmp_dir at this point!
# Capture the current working directory; create and enter _tmp_dir unless the
# notebook is already running inside it (makes the cell safe to re-run).
x = !pwd
if not x[0].endswith("_tmp_dir"):
    !mkdir _tmp_dir
    %cd _tmp_dir
else:
    print "Already in detected _tmp_dir:"
    !pwd
In [49]:
# Global parameters for the simulation, reused throughout the notebook.
N = 1000000  # length (bp) of the pseudorandom reference genome
n = 250000   # number of read pairs to simulate from it
k = 15       # k-mer length used when hashing the reads later on
In [5]:
# Generate a pseudorandom reference sequence of length N and write it as FASTA.
import random
random.seed(42)  # Fix the Python RNG seed so the reference is reproducible.
seq_file = "test_ref.fa"
rand_seq = "".join([random.choice("ATCG") for i in range(N)])
with open(seq_file, "w") as outfile:
    outfile.write(">seq1_len%d\n" % N)
    # Wrap the sequence body to 80-column lines, FASTA style.
    line_len = 80
    split_seq = "\n".join([rand_seq[i:i+line_len] for i in range(0, N, line_len)])
    # BUG FIX: terminate the file with a newline (POSIX text file; tools such
    # as `cat`/`wc` and some FASTA parsers expect it). The length assert in the
    # next cell joins lines without separators, so it is unaffected.
    outfile.write(split_seq + "\n")
# Reread and check length
!head "$seq_file"
# Re-read the sequence body (skip the FASTA header) and assert that exactly
# N bases were written.
x = !tail -n +2 "$seq_file"
assert(len("".join(x)) == N)
# Compress it
!gzip -vf "$seq_file"
In [6]:
# haplotype mode: no heterozygosity. All mutations are applied to one copy of the genome, and all reads are drawn from that copy.
# Running wgsim with no arguments prints its usage/help text for reference.
!wgsim
In [7]:
%%bash -s "$n"
# Simulate $1 (= n) pairs of 100 bp reads at the default mutation rate, with
# sequencing error disabled (-e 0.00).
# Note we set the random seed -S to be 42.
# wgsim writes the true mutation positions (the "genotype") to stdout.
wgsim -N $1 -1 100 -2 100 -S 42 -e 0.00 test_ref.fa.gz test_reads.r1.fq test_reads.r2.fq > test_reads.geno
head test_reads.geno
# Now we interleave the read pairs.
pairs join test_reads.r1.fq test_reads.r2.fq | gzip -f > test_reads.il.fq.gz
In [8]:
# Here is the first pair of reads
# (8 FASTQ lines = 2 reads of 4 lines each, interleaved R1 then R2).
!zcat test_reads.il.fq.gz | head -n 8
In [9]:
# Step 1.
# Hashes kmers in the reads file using a Count–min sketch (Ref: http://arxiv.org/abs/1309.2975).
# Output is a binary kmer hash table with fixed size roughly equal to N_TABLES * MIN_TABLESIZE bytes.
# Running without arguments prints the script's usage/help text.
!load-into-counting.py
In [10]:
# Step 2.
# Counts abundance of kmers in hash table.
# Output is a kmer abundance-frequency table in text format.
# Running without arguments prints the script's usage/help text.
!abundance-dist.py
In [11]:
%%bash -s "$k"
# Let's get into hashing.
# Parameters:
# kmer length
# Number of hash tables: each table has a different offset so that collisions can be resolved
# Size of each table: the larger the tables, the lower the collision chance
# Output filename
# Input interleaved FASTQ
# Deliberately undersized tables (4 x 1e4 bytes) to demonstrate the failure mode.
load-into-counting.py --ksize $1 --n_tables 4 --min-tablesize 1e4 test_reads.k$1.htable test_reads.il.fq.gz
# Notice the high false positive (fp) rate and the error:
# fp rate estimated to be 1.000
# ** ERROR: the k-mer counting table is too small this data set. Increase tablesize/# tables.
# This is because we are only allocating 40 KB memory per table, which is far too low for the number of reads.
In [12]:
%%bash -s "$k"
# Increasing the parameters such that we now allocate 8 tables of 40 Mb each.
# NOTE: you will need to change this if you vary the k or n parameters.
# For a real dataset, determining the required table size is not trivial, as it depends on the number of unique kmers in your reads,
# which in turn depends on the number of reads, polymorphism rate etc.
# The official instruction from http://khmer.readthedocs.org/en/v1.0/choosing-table-sizes.html
# is to use as much memory as you have available.
# A good fp rate is well below 1%.
# This overwrites the undersized .htable produced by the previous cell.
load-into-counting.py --ksize $1 --n_tables 8 --min-tablesize 4e7 test_reads.k$1.htable test_reads.il.fq.gz
In [13]:
%%bash -s "$k"
# Generate abundance frequency table
# (--no-zero omits abundances with zero counts; --squash overwrites any existing .hist)
abundance-dist.py --no-zero --squash test_reads.k$1.htable test_reads.il.fq.gz test_reads.k$1.hist
In [14]:
%%bash -s "$k"
# Peek at the abundance histogram. Columns in the output are:
# kmer abundance
# freq of kmers at that abundance count
# cumulative abundance
# cumulative fraction of total distinct k-mers.
head test_reads.k$1.hist
In [50]:
# Calculate kmer coverage
# Source: http://dna.med.monash.edu.au/~torsten/velvet_advisor/
# All coverage values in Velvet are provided in k-mer coverage,
# i.e. how many times has a k-mer been seen among the reads.
# The relation between k-mer coverage Ck and standard (nucleotide-wise) coverage C is
# Ck = C * (L - k + 1) / L
# where k is your hash length, and L your read length.
# Source: https://groups.google.com/forum/#!topic/bgi-soap/xKS39Nz4SCE
# base_coverage_depth = total_base_num/genome_size = (read_num*read_length)/genome_size
# kmer_coverage_depth = total_kmer_num/genome_size=read_num*(read_length-kmer_size+1)/genome_size
# So the relationship between base coverage depth and kmer coverage depth is:
# kmer_coverage_depth = base_coverage_depth*(read_length-kmer_size+1)/read_length
def get_estimated_coverage(num_read_pairs, genome_len, L=100, k=25):
    """Estimate base coverage C and k-mer coverage Ck for paired-end reads.

    num_read_pairs -- number of read pairs sequenced
    genome_len     -- genome size in bp
    L              -- read length in bp
    k              -- k-mer (hash) length

    Returns (C, Ck), where C = num_read_pairs * 2L / genome_len and
    Ck = C * (L - k + 1) / L  (Velvet's k-mer coverage relation).
    """
    # BUG FIX: the read length was hard-coded as 100 here, silently ignoring
    # the L parameter; use L so non-default read lengths are handled.
    C = num_read_pairs * L * 2 / genome_len
    Ck = C * (L - k + 1) / L
    return C, Ck
# Echo the simulation parameters, then the expected (C, Ck) for this dataset.
print n, N, k
print get_estimated_coverage(num_read_pairs=n, genome_len=N, L=100, k=k)
# Given the genome the coverage is sufficiently high, we would expect the main peak to be centered at the kmer coverage.
In [51]:
# Plotting freq against abundance to produce the kmer spectra,
# using a wrapper script to set axis limits
!python ../../scripts/plot_hist.py 100 10 test_reads.k$k\.hist
from IPython.core.display import Image, display
Image(filename="test_reads.k{0}.png".format(k))
# We can see the characteristic shape of a kmer spectrum of a random genome:
# Poisson distributed main peak
# Low abundance error peak
# Why is the main peak shifted left from the expected coverage of 43?
# Because the sequencing error rate of -e 0.02 shifts counts towards the error peak.
# NOTE(review): the simulation cell above passed -e 0.00, not -e 0.02 -- the
# narrative here appears to describe an earlier run; confirm which error rate
# actually produced the plotted histogram.
Out[51]:
In [17]:
# Repeating the demo using -e 0.00 removes the error peak and causes the main peak to be centered closer to the expected value.
# (Pre-generated figure shipped with the repository.)
Image(filename="../test_data/test_reads.k15.test_reads.k15.noerror.png")
Out[17]:
We simulate sequencing data from genomes with different characteristics to see if we can detect a subgenome signal.
In [18]:
# Get a Brachy reference genome
# NOTE(review): this is a time-limited, signed googleusercontent download URL
# and has almost certainly expired; replace with a stable source URL for the
# Bdistachyon MIPS 1.2 assembly before re-running.
!wget -nv https://doc-08-60-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/pq1qrefcuapcfa52g4pm3u32kuq69ad9/1415203200000/12528459609246823118/*/0B4t9UUDOrtFnZDJoZFBoZXRkN1U?e=download \
-O Bdistachyon_MIPS_1_2.fa.gz
In [19]:
%%bash
# Simulating reads from a polyploid genome with a subgenome with different mutation rate.
# sim_subgenomes.sh is a wrapper script that uses wgsim
# Simulates 2.5M reads at a polymorphism rate of 0.005 (1/200), then simulates 2.5M reads at a polymorphism rate of 0.05 and merges.
# Parameters:
# Reference file .fa.gz
# Total number of read pairs and read length
# Sequencing error rate
# Number of subgenomes
# Respective proportion of reads from each subgenome
# Respective polymorphism rates
../../scripts/sim_subgenomes.sh Bdistachyon_MIPS_1_2.fa.gz \
5000000 100 \
0.02 \
2 \
0.5 0.5 \
0.05 0.005
In [20]:
%%bash
# Simulating diploid genome reads.
# Same as above, except both genomes have the same polymorphism rate of 0.005
# (this serves as the diploid control for the polyploid simulation).
../../scripts/sim_subgenomes.sh Bdistachyon_MIPS_1_2.fa.gz \
5000000 100 \
0.02 \
2 \
0.5 0.5 \
0.005 0.005
In [21]:
%%bash
# Hashing simulated polyploid reads at k=25 using wrapper script auto_hash_and_count.sh
# Arguments:
# (Simulated) .il.fq.gz file
# 4 tables x 2e9 bytes = 8 GB memory usage
# k=25
../../scripts/auto_hash_and_count.sh Bdistachyon_MIPS_1_2.-0.02-0.5,0.5-0.05,0.005-.il.fq.gz \
4 2e9 \
25
In [22]:
!cat Bdistachyon_MIPS_1_2.-0.02-0.5,0.5-0.05,0.005-.k25.htable.info
In [23]:
%%bash
# Hashing simulated diploid reads at k=25
# Same parameters as above
../../scripts/auto_hash_and_count.sh Bdistachyon_MIPS_1_2.-0.02-0.5,0.5-0.005,0.005-.il.fq.gz \
4 2e9 \
25
In [24]:
!cat Bdistachyon_MIPS_1_2.-0.02-0.5,0.5-0.005,0.005-.k25.htable.info
In [25]:
# Play around with different axis limits
# Overlay the diploid and polyploid k=25 spectra (x <= 1000).
!python ../../scripts/plot_hist.py 1000 10 \
Bdistachyon_MIPS_1_2.-0.02-0.5,0.5-0.005,0.005-.k25.hist \
Bdistachyon_MIPS_1_2.-0.02-0.5,0.5-0.05,0.005-.k25.hist
Image(filename="Bdistachyon_MIPS_1_2.-0.02-0.5,0.5-0.005,0.005-.k25.Bdistachyon_MIPS_1_2.-0.02-0.5,0.5-0.05,0.005-.k25.png")
# No peaks?!
Out[25]:
In [26]:
%%bash
# Is sequencing error obscuring the peak signal?
# Set error rate to 0.00
# Re-simulate the polyploid reads with zero sequencing error...
../../scripts/sim_subgenomes.sh Bdistachyon_MIPS_1_2.fa.gz \
5000000 100 \
0.00 \
2 \
0.5 0.5 \
0.05 0.005
# Same parameters as above
# ...then hash them with the same table parameters as the earlier runs.
../../scripts/auto_hash_and_count.sh Bdistachyon_MIPS_1_2.-0.00-0.5,0.5-0.05,0.005-.il.fq.gz \
4 2e9 \
25
In [27]:
# Compare the error-free spectrum against the 2%-error spectrum.
!python ../../scripts/plot_hist.py 1000 10 \
Bdistachyon_MIPS_1_2.-0.00-0.5,0.5-0.05,0.005-.k25.hist \
Bdistachyon_MIPS_1_2.-0.02-0.5,0.5-0.05,0.005-.k25.hist
Image(filename="Bdistachyon_MIPS_1_2.-0.00-0.5,0.5-0.05,0.005-.k25.Bdistachyon_MIPS_1_2.-0.02-0.5,0.5-0.05,0.005-.k25.png")
# Fewer kmers with high abundance due to higher error rate creating more unique kmers.
# Still no signal.
Out[27]:
In [47]:
# Calculate expected cov.
# `wc -lm` prints "<newline_count> <char_count>" for the sequence lines, so
# genome length = characters minus newlines.
lm = !zcat Bdistachyon_MIPS_1_2.fa.gz | grep -v '^>' | wc -lm
genome_len = int(lm[0].split()[1]) - int(lm[0].split()[0])
print genome_len
print get_estimated_coverage(num_read_pairs=5e6, genome_len=genome_len)
In [29]:
# Let's zoom in around that region
# (restrict the x-axis to abundance <= 10 to inspect the low-coverage peak).
!python ../../scripts/plot_hist.py 10 10 \
Bdistachyon_MIPS_1_2.-0.02-0.5,0.5-0.005,0.005-.k25.hist \
Bdistachyon_MIPS_1_2.-0.02-0.5,0.5-0.05,0.005-.k25.hist
Image(filename="Bdistachyon_MIPS_1_2.-0.02-0.5,0.5-0.005,0.005-.k25.Bdistachyon_MIPS_1_2.-0.02-0.5,0.5-0.05,0.005-.k25.png")
Out[29]:
In [52]:
%cd ../test_data
In [31]:
# Bdis05 WGS reads
# 9.7 M read pairs
print get_estimated_coverage(num_read_pairs=9.7e6, genome_len=genome_len)
# We also hashed a hexaploid whole genome data set from the Vogel lab.
# Approx. 6M read pairs.
# Possibly Brachypodium hybridum ABR113?
# http://aob.oxfordjournals.org/content/early/2012/01/01/aob.mcr294.full
# Allotetraploid with 2n = 30 chromosomes (x = 10 + x = 5), derived from the cross between B. stacei and B. distachyon.
# Genome size of 1.265 pg/2C. => 978 Mb
# (using 1 pg ~ 978 Mb, so 1C ~ 1.265/2 * 978e6 bp, as computed below)
print get_estimated_coverage(num_read_pairs=6e6, genome_len=1.265*978e6/2)
# Contains both relatively small B. stacei-type and large B. distachyon-type chromosomes.
# The name hybridum refers to its hybrid allotetraploid nature.
!python ../../scripts/plot_hist.py 4000 10 \
../test_data/bdis05/Bdis05-1_qcd.k25.hist \
../test_data/vogel_hexaploids/7424.2.73049.ACAGTG.k25.hist
Image(filename="Bdis05-1_qcd.k25.7424.2.73049.ACAGTG.k25.png")
# Interesting, but with such low coverage, peaks at high abundance are likely to be organelle DNA/simple repeats.
Out[31]:
Compared to WGS data, there are additional considerations with GBS data. However:
We used two Brachypodium GBS datasets: bd1 (mostly diploid) and bd6 (mostly hexaploid). bd1 is a recombinant inbred line; bd6 is not, but the species is selfing, so low heterozygosity is expected in wild populations anyway.
In [32]:
%matplotlib inline
import matplotlib.pyplot as plt
# Can we improve kmer coverage by hashing at a lower k?
fig, ax = plt.subplots()
ax.set_xlabel("k")
ax.set_ylabel("C_k")
ax.plot(range(10, 30),
[get_estimated_coverage(num_read_pairs=4e6, genome_len=genome_len, k=k)[1]
for k in range(10, 30)])
plt.savefig("improve_ck.png", bbox_inches='tight')
# Not really, the range is small.
In [53]:
# We just hashed at k=12 because memory usage is much better.
# Only 400 Mb per sample.
# Show the memory/fp-rate lines from the hashing log, then overlay the
# diploid (bd1) and hexaploid (bd6) GBS spectra.
!grep -E 'memory|fp|ms' ../test_data/bd1_gbs/bd_1E4.k12.log
!python ../../scripts/plot_hist.py 2500 10 \
../test_data/bd1_gbs/bd_1E4.k12.hist \
../test_data/bd6_gbs/bd_6D1.k12.hist
Image(filename="bd_1E4.k12.bd_6D1.k12.png")
print get_estimated_coverage(num_read_pairs=4e6, genome_len=genome_len)
# The hexaploid sample has a much lower count of unique kmers that have high abundance
One advantage of a smaller k: there are far fewer 12-mers than 25-mers. We can iterate over all kmers and select the most abundant kmers in one of the samples, then construct a counts matrix for that set of kmers over all samples, normalised by library size.
In [34]:
# File with all the samples we want to analyse. Each line holds:
# Path to sample .htable
# number of read pairs in the sample to normalise counts by.
# Another option would be to normalise by the total number of kmers instead,
# These numbers are reported in the hashing .log files:
# e.g. Total number of k-mers: 5508702
# The set of most abundant kmers is derived from the first sample in the list.
# Unfortunately this .htable data is not on Dropbox (~9.7 Gb)
!head bd_clustering/k12_tables.txt
In [35]:
%%bash
cd ../test_data/bd_clustering/
# Build a presence/absence matrix of abundant k-mers across samples.
# Arguments:
# List of sample kmer hash tables .htable
# k=12
# After skipping the 200 most abundant kmers of first sample, analyse the next 100.
# In order for the matrix to be informative, a mean sample presence that is not too close to 100% is required.
python ../../../scripts/repeat_pres_abs.py k12_tables.txt 12 200 100
# In this case: "Mean sample presence proportion is 83.33%"
In [36]:
# Cluster samples (default dist measure="euclidean", hclust method="complete")
# Hierarchically cluster the presence/absence matrix and display the dendrogram.
!Rscript ../../scripts/cluster_samples.R \
../test_data/bd_clustering/k12_tables.skip200.N100.pres_abs.csv \
k12_tables.skip200.N100_cluster.png
Image(filename="k12_tables.skip200.N100_cluster.png")
Out[36]:
In [41]:
# Change back to starting dir, ready to re-run
%cd ../
Work in progress at https://github.com/digitase/brachy_khmer