This is the de novo assembly pipeline used for the manuscript "Comparison of Two Whole-Genome Sequencing Methods for Analysis of Three Methicillin-Resistant Staphylococcus aureus Outbreaks." The pipeline takes sequencing reads from an isolate, specifically a pair of fastq files from paired-end sequencing, and a pair of fastq files from mate pair sequencing. The pipeline outputs contigs and scaffolds for the isolate.
As written in this notebook, this pipeline is used to assemble sequencing reads from a single isolate.
SPAdes
version 3.1.1 velvet
version 1.2.10Trimmomatic
version 0.32seqtk
KmerGenie
version 1.6741R
version 3.2.0The software dependencies of the above software will also be needed.
Sources for custom code and scripts are included elswhere in this repository.
You will need a name for the sample (stored in the ALIAS
variable), the location of the fastq files to assemble (stored in the PE_*
and MP_*
variables), and the target coverage for the data (stored in TARGET_COV
).
The values for the parameters in the different tools are set as used for the processing of isolate sequencing data in the manuscript.
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.
Copyright 2017, Mayo Foundation for Medical Education and Research.
In [4]:
ALIAS=my_sample_name
PE_1=/path/to/paired_end_read_1
PE_2=/path/to/paired_end_read_2
MP_1=/path/to/mate_pair_read_1
MP_2=/path/to/mate_pair_read_2
TARGET_COV=100.0
In [ ]:
TRIMMOMATIC_JAR=/path/to/trimmomatic-0.32.jar
java -jar $TRIMMOMATIC_JAR PE -threads $NSLOTS $PE_1 $PE_2 \
${ALIAS}_pe_R1.c.fq ${ALIAS}_pe_U1.c.fq \
${ALIAS}_pe_R2.c.fq ${ALIAS}_pe_U2.c.fq \
ILLUMINACLIP:Adapters.fasta:2:30:10 LEADING:3 TRAILING:3 MAXINFO:220:0.1 MINLEN:70
java -jar $TRIMMOMATIC_JAR PE -threads $NSLOTS $MP_1 $MP_2 \
${ALIAS}_mp1_R1.c.fq ${ALIAS}_mp_U1.c.fq \
${ALIAS}_mp1_R2.c.fq ${ALIAS}_mp_U2.c.fq \
ILLUMINACLIP:Adapters.fasta:2:30:10 LEADING:3 TRAILING:3 MAXINFO:220:0.1 MINLEN:70
#concatenate single reads
cat *U[12].c.fq > ${ALIAS}_se.i.fq
In [ ]:
for kind in pe mp
do
seqtk mergepe ${ALIAS}_${kind}_R1.c.fq ${ALIAS}_${kind}_R2.c.fq > ${ALIAS}_${kind}.i.fq
median_filter ${ALIAS}_${kind}.i.fq > ${ALIAS}_${kind}.mf.fq
extract_paired_reads ${ALIAS}_${kind}.mf.fq
done
In [ ]:
#concatenate single reads
cat *.se ${ALIAS}_se.mf.fq | gzip > ${ALIAS}_se.fq.gz
#renaming paired reads, compress
for kind in pe mp
do
mv ${ALIAS}_${kind}.mf.fq.pe ${ALIAS}_${kind}.fq
gzip ${ALIAS}_${kind}.fq
done
In [ ]:
spades.py -t $NSLOTS --only-error-correction -o spades_output \
--pe1-12 ${ALIAS}_pe.fq.gz --pe1-s ${ALIAS}_se.fq.gz \
--hqmp1-12 ${ALIAS}_mp1.fq.gz --hqmp2-12 ${ALIAS}_mp2.fq.gz
In [ ]:
cd spades_output/corrected
gunzip ${ALIAS}_*_*.00.*_0.cor.fastq.gz
${ALIAS}_pe_1.00.1_0.cor.fastq ${ALIAS}_pe_2.00.1_0.cor.fastq > ${ALIAS}_pe.ec.fq
${ALIAS}_mp_1.00.0_0.cor.fastq ${ALIAS}_mp_2.00.0_0.cor.fastq > ${ALIAS}_mp.ec.fq
mv *.ec.fq ../../
cd ../../
Normalize the expected coverage using specialk
from KmerGenie
, as well as it's R
scripts.
specialk
seqtk sample
, to match desired coverage.velvet
in the arrays COVS
and CUTOFFS
.
In [ ]:
R_SCRIPTS=/path/to/kmergenie/R_scripts
ls *.ec.fq > ecfiles.txt
specialk ecfiles.txt -t $NSLOTS -l 51 -k 51 -o ${ALIAS}or
cp $R_SCRIPTS/*.r .
ACTUAL_COV=$(Rscript get-peak.r ${ALIAS}or-k51.histo)
FRACTION=$(python -c "print $TARGET_COV/$ACTUAL_COV")
echo "COV=$TARGET_COV" > coverage.txt
seqtk sample -s $A_NUMBER_1 ${ALIAS}_pe.ec.fq $FRACTION > ${ALIAS}_pe.ec.keep.fq
seqtk sample -s $A_NUMBER_2 ${ALIAS}_mp.ec.fq $FRACTION > ${ALIAS}_mp.ec.keep.fq
seqtk sample -s $A_NUMBER_1 ${ALIAS}_se.ec.fq $FRACTION > ${ALIAS}_se.ec.keep.fq
ls *.ec.keep.fq > ecfiles.txt
specialk ecfiles.txt -t $NSLOTS -l 31 -k 121 -s 2 -o ${ALIAS}
echo "declare -A COVS" >> coverage.txt
echo "declare -A CUTOFFS" > cutoffs.txt
#add one by one to the coverages files
for kmer in $(seq 31 2 121)
do
MY_COV=$(Rscript get-peak.r ${ALIAS}-k${kmer}.histo)
MY_CUTOFF=$(Rscript get-cutoff.r ${ALIAS}-k${kmer}.histo)
COVS[$kmer]=$MY_COV
CUTOFFS[$kmer]=$MY_CUTOFF
done
In [ ]:
seqtk seq -r ${ALIAS}_mp.ec.keep.fq > ${ALIAS}_mp.ec.keep.rc.fq
In [ ]:
velveth velvet_seqs 33 -noHash -create_binary -fastq.gz \
-shortPaired $CLEANDIR/${ALIAS}_pe.ec.keep.fq.gz \
-shortPaired2 $CLEANDIR/${ALIAS}_mp.ec.keep.rc.fq.gz \
-short3 $CLEANDIR/${ALIAS}_se.ec.keep.fq.gz
In [ ]:
for KMER in $(seq 31 2 121)
do
mkdir velvet_$KMER
cd velvet_$KMER
ln -s ../velvet_seqs/CnyUnifiedSeq
ln -s ../velvet_seqs/CnyUnifiedSeq.names
cd ..
#hash the reads
OMP_NUM_THREADS=$NSLOTS velveth velvet_$KMER $KMER -reuse_binary
#assemble the reads
MY_COV=${COVS[$KMER]}
OMP_NUM_THREADS=$NSLOTS velvetg velvet_$KMER -scaffolding no \
-exp_cov $MY_COV -cov_cutoff ${CUTOFFS[$KMER]} -read_trkg yes \
-ins_length 500 -ins_length2 8000 -min_contig_lgth 1000
done
In [ ]:
BEST_VELVET_CONTIGS=$(assembly_useful_stats.py --no_header velvet_*/contigs.fa|sort -nrk8|head -n1|cut -f 10)
In [ ]:
spades.py -t $NSLOTS --only-assembler -o spades_final_output -k 21,33,55,77,99,127 \
--pe1-12 $CLEANDIR/${ALIAS}_pe.ec.keep.fq.gz --pe1-s $CLEANDIR/${ALIAS}_se.ec.keep.fq.gz \
--hqmp1-12 $CLEANDIR/${ALIAS}_mp.ec.keep.fq.gz \
--untrusted-contigs $BEST_VELVET_CONTIGS
In [ ]:
CONTIGS=spades_final_output/contigs.fasta
SCAFFOLDS=spades_final_output/scaffolds.fasta