In [17]:
%run ../../../shared_setup.ipynb
In [2]:
tbl_sequences = (
etl.wrap([
['ID', 'clone', 'fasta'],
['birren_hb3_contigs', 'HB3', '/data/plasmodium/pfalciparum/pf-crosses/data/reference/birren_2006/plasmodium_falciparum__isolate_hb3__1_contigs.fasta'],
['birren_hb3_supercontigs', 'HB3', '/data/plasmodium/pfalciparum/pf-crosses/data/reference/birren_2006/plasmodium_falciparum__isolate_hb3__1_supercontigs.fasta'],
['birren_dd2_contigs', 'Dd2', '/data/plasmodium/pfalciparum/pf-crosses/data/reference/birren_2006/plasmodium_falciparum__isolate_dd2__1_contigs.fasta'],
['birren_dd2_supercontigs', 'Dd2', '/data/plasmodium/pfalciparum/pf-crosses/data/reference/birren_2006/plasmodium_falciparum__isolate_dd2__1_supercontigs.fasta'],
['birren_7g8_contigs', '7G8', '/data/plasmodium/pfalciparum/pf-crosses/data/reference/birren_2006/plasmodium_falciparum__isolate_7g8__1_contigs.fasta'],
['birren_7g8_supercontigs', '7G8', '/data/plasmodium/pfalciparum/pf-crosses/data/reference/birren_2006/plasmodium_falciparum__isolate_7g8__1_supercontigs.fasta'],
['garimella_3d7_ERR019061_contigs', '3D7', '/data/plasmodium/pfalciparum/pf-crosses/data/kiran_assemblies/20150413/3D7xHB3.3D7.PG0051-C.ERR019061.fasta'],
['garimella_hb3_ERR019054_contigs', 'HB3', '/data/plasmodium/pfalciparum/pf-crosses/data/kiran_assemblies/20150413/3D7xHB3.HB3.PG0052-C.ERR019054.fasta'],
['garimella_hb3_ERR012788_contigs', 'HB3', '/data/plasmodium/pfalciparum/pf-crosses/data/kiran_assemblies/20150413/HB3xDD2.HB3.PG0004-CW.ERR012788.fasta'],
['garimella_dd2_ERR012840_contigs', 'Dd2', '/data/plasmodium/pfalciparum/pf-crosses/data/kiran_assemblies/20150413/HB3xDD2.DD2.PG0008-CW.ERR012840.fasta'],
['garimella_7g8_ERR027099_contigs', '7G8', '/data/plasmodium/pfalciparum/pf-crosses/data/kiran_assemblies/20150413/7G8xGB4.7G8.PG0083-C.ERR027099.fasta'],
['garimella_gb4_ERR027099_contigs', 'GB4', '/data/plasmodium/pfalciparum/pf-crosses/data/kiran_assemblies/20150413/7G8xGB4.GB4.PG0084-C.ERR027100.fasta'],
['genbank_hb3_coding_sequences', 'HB3', '/data/plasmodium/pfalciparum/pf-crosses/data/reference/genbank/hb3_coding_sequences.fasta'],
])
.addfield('n_contigs', lambda row: len(pyfasta.Fasta(row.fasta, key_fn=lambda k: k.split()[0])))
)
tbl_sequences.displayall()
In [3]:
%%file /data/plasmodium/pfalciparum/pf-crosses/data/evaluation/scripts/align_bwasw_default.sh
#!/bin/bash
set -e
set -u
set -o pipefail
set -x
# arguments
target=$1
query=$2
outdir=$3
prefix=$4
# programs
bwa=/data/plasmodium/pfalciparum/pf-crosses/opt/bwa-0.7.12/bwa
samtools=/data/plasmodium/pfalciparum/pf-crosses/opt/samtools-1.2/samtools
# ensure target and query have been indexed
if [ ! -f ${target}.sa ]
then
$bwa index $target
fi
if [ ! -f ${query}.sa ]
then
$bwa index $query
fi
# perform alignment to SAM
samfile=${outdir}/${prefix}.sam
if [ ! -f ${samfile}.ok ]
then
$bwa bwasw -f $samfile -t4 $target $query
touch ${samfile}.ok
fi
# convert to BAM
bamfile=${outdir}/${prefix}.bam
if [ ! -f ${bamfile}.ok ]
then
$samtools sort -O bam -o $bamfile -T ${bamfile}.tmp $samfile
$samtools index $bamfile
touch ${bamfile}.ok
fi
In [4]:
%%file /data/plasmodium/pfalciparum/pf-crosses/data/evaluation/scripts/align_bwamem_default.sh
#!/bin/bash
set -e
set -u
set -o pipefail
set -x
# arguments
target=$1
query=$2
outdir=$3
prefix=$4
# programs
bwa=/data/plasmodium/pfalciparum/pf-crosses/opt/bwa-0.7.12/bwa
samtools=/data/plasmodium/pfalciparum/pf-crosses/opt/samtools-1.2/samtools
# ensure target and query have been indexed
if [ ! -f ${target}.sa ]
then
$bwa index $target
fi
if [ ! -f ${query}.sa ]
then
$bwa index $query
fi
# perform alignment to SAM
samfile=${outdir}/${prefix}.sam
if [ ! -f ${samfile}.ok ]
then
$bwa mem $target $query > $samfile
touch ${samfile}.ok
fi
# convert to BAM
bamfile=${outdir}/${prefix}.bam
if [ ! -f ${bamfile}.ok ]
then
$samtools sort -O bam -o $bamfile -T ${bamfile}.tmp $samfile
$samtools index $bamfile
touch ${bamfile}.ok
fi
In [5]:
%%file /data/plasmodium/pfalciparum/pf-crosses/data/evaluation/scripts/align_bwamem_intractg.sh
#!/bin/bash
set -e
set -u
set -o pipefail
set -x
# arguments
target=$1
query=$2
outdir=$3
prefix=$4
# programs
bwa=/data/plasmodium/pfalciparum/pf-crosses/opt/bwa-0.7.12/bwa
samtools=/data/plasmodium/pfalciparum/pf-crosses/opt/samtools-1.2/samtools
# ensure target and query have been indexed
if [ ! -f ${target}.sa ]
then
$bwa index $target
fi
if [ ! -f ${query}.sa ]
then
$bwa index $query
fi
# perform alignment to SAM
samfile=${outdir}/${prefix}.sam
if [ ! -f ${samfile}.ok ]
then
$bwa mem -x intractg $target $query > $samfile
touch ${samfile}.ok
fi
# convert to BAM
bamfile=${outdir}/${prefix}.bam
if [ ! -f ${bamfile}.ok ]
then
$samtools sort -O bam -o $bamfile -T ${bamfile}.tmp $samfile
$samtools index $bamfile
touch ${bamfile}.ok
fi
In [6]:
%%file /data/plasmodium/pfalciparum/pf-crosses/data/evaluation/scripts/align_lastz_default.sh
#!/bin/bash
set -e
set -u
set -o pipefail
set -x
# arguments
target=$1
query=$2
outdir=$3
prefix=$4
# programs
bwa=/data/plasmodium/pfalciparum/pf-crosses/opt/bwa-0.7.12/bwa
samtools=/data/plasmodium/pfalciparum/pf-crosses/opt/samtools-1.2/samtools
lastz=/data/plasmodium/pfalciparum/pf-crosses/opt/lastz/lastz-distrib-1.03.66/bin/lastz
# perform alignment to SAM
samfile=${outdir}/${prefix}.sam
if [ ! -f ${samfile}.ok ]
then
$lastz ${target}[multiple] $query --queryhspbest=1 --format=sam --output=$samfile --progress=1
touch ${samfile}.ok
fi
# convert to BAM
bamfile=${outdir}/${prefix}.bam
if [ ! -f ${bamfile}.ok ]
then
$samtools sort -O bam -o $bamfile -T ${bamfile}.tmp $samfile
$samtools index $bamfile
touch ${bamfile}.ok
fi
In [7]:
%%file /data/plasmodium/pfalciparum/pf-crosses/data/evaluation/scripts/call_bcftools_consensus.sh
#!/bin/bash
set -e
set -u
set -o pipefail
set -x
# arguments
fasta=$1
bamfile=$2
outdir=$3
prefix=$4
# programs
samtools=/data/plasmodium/pfalciparum/pf-crosses/opt/samtools-1.2/samtools
bcftools=/data/plasmodium/pfalciparum/pf-crosses/opt/bcftools-1.2/bcftools
java=/usr/lib/jvm/java-7-openjdk-amd64/bin/java
gatk_jar=/data/plasmodium/pfalciparum/pf-crosses/opt/GenomeAnalysisTK-3.3-0/GenomeAnalysisTK.jar
gatk="$java -Xmx2G -jar $gatk_jar -R $fasta"
# create samples file
samplesfile=${outdir}/${prefix}.samples
echo -e "${bamfile}\t1" > $samplesfile
# perform calling
vcffile_raw_bgz=${outdir}/${prefix}.raw.vcf.gz
vcffile_leftaligned=${outdir}/${prefix}.leftaligned.vcf
indicator=${outdir}/${prefix}.ok
if [ ! -f $indicator ]
then
$samtools mpileup \
--uncompressed \
--vcf \
--count-orphans \
--no-BAQ \
--ignore-overlaps \
--fasta-ref $fasta $bamfile \
| $bcftools call - \
--consensus-caller \
--output-type v \
--samples-file $samplesfile \
--variants-only \
| bgzip -c > $vcffile_raw_bgz
tabix -p vcf $vcffile_raw_bgz
$gatk -T LeftAlignAndTrimVariants -V $vcffile_raw_bgz -o $vcffile_leftaligned
bgzip $vcffile_leftaligned
tabix -p vcf ${vcffile_leftaligned}.gz
touch $indicator
fi
In [8]:
%%file /data/plasmodium/pfalciparum/pf-crosses/data/evaluation/scripts/call_bcftools_multiallelic.sh
#!/bin/bash
set -e
set -u
set -o pipefail
set -x
# arguments
fasta=$1
bamfile=$2
outdir=$3
prefix=$4
# programs
samtools=/data/plasmodium/pfalciparum/pf-crosses/opt/samtools-1.2/samtools
bcftools=/data/plasmodium/pfalciparum/pf-crosses/opt/bcftools-1.2/bcftools
java=/usr/lib/jvm/java-7-openjdk-amd64/bin/java
gatk_jar=/data/plasmodium/pfalciparum/pf-crosses/opt/GenomeAnalysisTK-3.3-0/GenomeAnalysisTK.jar
gatk="$java -Xmx2G -jar $gatk_jar -R $fasta"
# create samples file
samplesfile=${outdir}/${prefix}.samples
echo -e "${bamfile}\t1" > $samplesfile
# perform calling
vcffile_raw_bgz=${outdir}/${prefix}.raw.vcf.gz
vcffile_leftaligned=${outdir}/${prefix}.leftaligned.vcf
indicator=${outdir}/${prefix}.ok
if [ ! -f $indicator ]
then
$samtools mpileup \
--uncompressed \
--vcf \
--count-orphans \
--no-BAQ \
--ignore-overlaps \
--fasta-ref $fasta $bamfile \
--open-prob 0 \
--ext-prob 0 \
--tandem-qual 0 \
| $bcftools call - \
--multiallelic-caller \
--output-type v \
--samples-file $samplesfile \
--variants-only \
--prior 0 \
| bgzip -c > $vcffile_raw_bgz
tabix -p vcf $vcffile_raw_bgz
$gatk -T LeftAlignAndTrimVariants -V $vcffile_raw_bgz -o $vcffile_leftaligned
bgzip $vcffile_leftaligned
tabix -p vcf ${vcffile_leftaligned}.gz
touch $indicator
fi
In [9]:
%%file /data/plasmodium/pfalciparum/pf-crosses/data/evaluation/scripts/call_gatk_ug.sh
#!/bin/bash
set -e
set -u
set -o pipefail
set -x
# arguments
fasta=$1
bamfile=$2
outdir=$3
prefix=$4
# programs
java=/usr/lib/jvm/java-7-openjdk-amd64/bin/java
gatk_jar=/data/plasmodium/pfalciparum/pf-crosses/opt/GenomeAnalysisTK-3.3-0/GenomeAnalysisTK.jar
gatk="$java -Xmx2G -jar $gatk_jar -R $fasta"
picard=/data/plasmodium/pfalciparum/pf-crosses/opt/picard-tools-1.77
samtools=/data/plasmodium/pfalciparum/pf-crosses/opt/samtools-1.2/samtools
# add read groups for GATK
fixedbamfile=${outdir}/${prefix}.fixed.bam
$java -Xmx2g -jar ${picard}/AddOrReplaceReadGroups.jar \
INPUT=$bamfile \
OUTPUT=$fixedbamfile \
ID=$prefix \
LB=$prefix \
PL=unknown \
PU=$prefix \
SM=$prefix \
VALIDATION_STRINGENCY=LENIENT
$samtools index $fixedbamfile
# perform calling
vcffile_raw=${outdir}/${prefix}.raw.vcf
vcffile_leftaligned=${outdir}/${prefix}.leftaligned.vcf
indicator=${outdir}/${prefix}.ok
if [ ! -f $indicator ]
then
$gatk -T UnifiedGenotyper \
-I $fixedbamfile \
-o $vcffile_raw \
-stand_emit_conf 0 \
-stand_call_conf 0 \
-ploidy 1 \
-glm BOTH \
--defaultBaseQualities 30 \
-mbq 1 \
-minIndelCnt 1
$gatk -T LeftAlignAndTrimVariants -V $vcffile_raw -o $vcffile_leftaligned
bgzip $vcffile_leftaligned
tabix -p vcf ${vcffile_leftaligned}.gz
touch $indicator
fi
In [10]:
%%file /data/plasmodium/pfalciparum/pf-crosses/data/evaluation/scripts/call_freebayes.sh
#!/bin/bash
set -e
set -u
set -o pipefail
set -x
# arguments
fasta=$1
bamfile=$2
outdir=$3
prefix=$4
# programs
java=/usr/lib/jvm/java-7-openjdk-amd64/bin/java
gatk_jar=/data/plasmodium/pfalciparum/pf-crosses/opt/GenomeAnalysisTK-3.3-0/GenomeAnalysisTK.jar
gatk="$java -Xmx2G -jar $gatk_jar -R $fasta"
picard=/data/plasmodium/pfalciparum/pf-crosses/opt/picard-tools-1.77
samtools=/data/plasmodium/pfalciparum/pf-crosses/opt/samtools-1.2/samtools
freebayes=/data/plasmodium/pfalciparum/pf-crosses/opt/freebayes/bin/freebayes
# add read groups for GATK
fixedbamfile=${outdir}/${prefix}.fixed.bam
$java -Xmx2g -jar ${picard}/AddOrReplaceReadGroups.jar \
INPUT=$bamfile \
OUTPUT=$fixedbamfile \
ID=$prefix \
LB=$prefix \
PL=unknown \
PU=$prefix \
SM=$prefix \
VALIDATION_STRINGENCY=LENIENT
$samtools index $fixedbamfile
# perform calling
vcffile_raw=${outdir}/${prefix}.raw.vcf
vcffile_leftaligned=${outdir}/${prefix}.leftaligned.vcf
indicator=${outdir}/${prefix}.ok
if [ ! -f $indicator ]
then
$freebayes -f $fasta \
--ploidy 1 \
--min-alternate-count 1 \
--min-alternate-fraction 0 \
--hwe-priors-off \
--binomial-obs-priors-off \
--allele-balance-priors-off \
--report-genotype-likelihood-max \
--haplotype-length 0 \
--use-reference-allele --reference-quality 0,0 \
--legacy-gls \
$fixedbamfile > $vcffile_raw
$gatk -T LeftAlignAndTrimVariants -V $vcffile_raw -o $vcffile_leftaligned
bgzip $vcffile_leftaligned
tabix -p vcf ${vcffile_leftaligned}.gz
touch $indicator
fi
In [12]:
tbl_alignment_configurations = etl.wrap([
['ID', 'script'],
['bwasw_default', '/data/plasmodium/pfalciparum/pf-crosses/data/evaluation/scripts/align_bwasw_default.sh'],
['bwamem_default', '/data/plasmodium/pfalciparum/pf-crosses/data/evaluation/scripts/align_bwamem_default.sh'],
['bwamem_intractg', '/data/plasmodium/pfalciparum/pf-crosses/data/evaluation/scripts/align_bwamem_intractg.sh'],
# ['lastz_default', '/data/plasmodium/pfalciparum/pf-crosses/data/evaluation/scripts/align_lastz_default.sh'],
])
In [13]:
tbl_calling_configurations = etl.wrap([
['ID', 'script'],
['bcftools_consensus', '/data/plasmodium/pfalciparum/pf-crosses/data/evaluation/scripts/call_bcftools_consensus.sh'],
['bcftools_multiallelic', '/data/plasmodium/pfalciparum/pf-crosses/data/evaluation/scripts/call_bcftools_multiallelic.sh'],
['gatk_ug', '/data/plasmodium/pfalciparum/pf-crosses/data/evaluation/scripts/call_gatk_ug.sh'],
# ['freebayes', '/data/plasmodium/pfalciparum/pf-crosses/data/evaluation/scripts/call_freebayes.sh'],
])
In [14]:
truth_dir = '/data/plasmodium/pfalciparum/pf-crosses/data/evaluation/truth'
In [15]:
target = '/data/plasmodium/pfalciparum/pf-crosses/data/genome/sanger/version3/September_2012/Pf3D7_v3.lookseq.fa'
In [15]:
for prefix, clone, query, _ in tbl_sequences.data():
log(prefix, clone, query)
for config, script in tbl_alignment_configurations.data():
log(config, script)
outdir = os.path.join(truth_dir, config, 'alignment')
if not os.path.exists(outdir):
os.makedirs(outdir)
!(bash {script} {target} {query} {outdir} {prefix} > {outdir}/{prefix}.log 2>&1 &)
In [16]:
for prefix, _, _, _ in tbl_sequences.data():
log(prefix)
for alignment_config, _ in tbl_alignment_configurations.data():
log(alignment_config)
bamfile = os.path.join(truth_dir, alignment_config, 'alignment', prefix + '.bam')
log(bamfile)
for calling_config, script in tbl_calling_configurations.data():
log(calling_config, script)
outdir = os.path.join(truth_dir, alignment_config, 'calling', calling_config)
if not os.path.exists(outdir):
os.makedirs(outdir)
!(bash {script} {target} {bamfile} {outdir} {prefix} > {outdir}/{prefix}.log 2>&1 &)
In [143]:
prefix, clone, query, _ = tbl_sequences[7]
prefix, clone, query
Out[143]:
In [144]:
alignment_config = 'lastz_default'
alignment_outdir = os.path.join(truth_dir, alignment_config, 'alignment')
if not os.path.exists(alignment_outdir):
os.makedirs(alignment_outdir)
!ls -lh {alignment_outdir}
In [145]:
bamfile = os.path.join(alignment_outdir, prefix + '.bam')
!ls -lh {bamfile}*
In [146]:
_, alignment_script = tbl_alignment_configurations[4]
alignment_script
Out[146]:
In [98]:
!(bash {alignment_script} {target} {query} {alignment_outdir} {prefix} > {alignment_outdir}/{prefix}.log 2>&1 &)
In [247]:
prefix, clone, query, _ = tbl_sequences[7]
prefix, clone, query
Out[247]:
In [248]:
alignment_config = 'bwamem_intractg'
alignment_outdir = os.path.join(truth_dir, alignment_config, 'alignment')
if not os.path.exists(alignment_outdir):
os.makedirs(alignment_outdir)
bamfile = os.path.join(alignment_outdir, prefix + '.bam')
!ls -lh {bamfile}*
In [249]:
calling_config, calling_script = tbl_calling_configurations[4]
calling_config, calling_script
Out[249]:
In [250]:
calling_outdir = os.path.join(truth_dir, alignment_config, 'calling', calling_config)
if not os.path.exists(calling_outdir):
os.makedirs(calling_outdir)
!ls -lh {calling_outdir}
In [251]:
#!rm -v {calling_outdir}/*
In [252]:
!(bash {calling_script} {target} {bamfile} {calling_outdir} {prefix} > {calling_outdir}/{prefix}.log 2>&1 &)
In [19]:
prefix = 'genbank_hb3_coding_sequences'
truth_dir = '/data/plasmodium/pfalciparum/pf-crosses/data/evaluation/truth'
alignment_config = 'bwamem_intractg'
alignment_outdir = os.path.join(truth_dir, alignment_config, 'alignment')
bam_fn = os.path.join(alignment_outdir, prefix + '.bam')
!ls -lh {bam_fn}
In [20]:
bam = pysam.AlignmentFile(bam_fn)
bam
Out[20]:
In [33]:
seq = pyfasta.Fasta(target)['Pf3D7_01_v3']
seq
Out[33]:
In [41]:
for col in bam.pileup(region='Pf3D7_01_v3:265200-269174', stepper='nofilter'):
ref_base = seq[col.reference_pos]
for read in col.pileups:
if read.indel < 0:
ref_allele = seq[col.reference_pos:col.reference_pos + -1*read.indel + 1]
alt_allele = ref_base
print(col.reference_pos + 1, ref_allele, alt_allele)
elif read.indel > 0:
ref_allele = ref_base
alt_allele = read.alignment.query_sequence[read.query_position:read.query_position + read.indel + 1]
print(col.reference_pos + 1, ref_allele, alt_allele)
elif not read.is_del and ref_base != read.alignment.query_sequence[read.query_position]:
ref_allele = ref_base
alt_allele = read.alignment.query_sequence[read.query_position]
print(col.reference_pos + 1, ref_allele, alt_allele)
In [24]:
for read in bam.fetch(region='Pf3D7_01_v3:263208-271173'):
print(read)
print(dir(read))
for op in read.cigar:
print(op)
In [ ]: