In [51]:
# directory where you want the spacer blasting to be done
## CHANGE THIS!
workDir = "/home/nyoungb2/t/CLdb_Ecoli/"
In [52]:
import os
from IPython.display import FileLinks
%load_ext rpy2.ipython
In [53]:
blastDir = os.path.join(workDir, 'arrayBlast')
if not os.path.isdir(blastDir):
os.makedirs(blastDir)
In [85]:
!cd $blastDir; \
CLdb -- array2fasta -cluster -cut 1 > spacers_cut1.fna
## getting number of spacer sequences
!printf 'Number of unique spacers:\t'
!cd $blastDir; \
grep -c ">" spacers_cut1.fna; \
echo; \
head -n 6 spacers_cut1.fna
In [81]:
!CLdb -- arrayBlast -h
In [82]:
# listing all arrayBlast subcommands
!CLdb -- arrayBlast --list
In [83]:
# spacer blast
## Really just a thin wrapper around blastn that will blast the spacers against all genomes in CLdb
### If you want to blast some else, just run the blast yourself (use 'perldoc CLdb_arrayBlast.pl' for more info)
!cd $blastDir; \
CLdb -- arrayBlast -- run \
-query spacers_cut1.fna \
> spacers_cut1_blastn.xml
Note:
In [84]:
# converting .xml to .srl
## This retains all of the info in the xml, but produces a smaller file that is faster to load and write.
## For large blast runs (xml files > 100 Mb), this will take a little while
!cd $blastDir; \
CLdb -- arrayBlast -- xml2srl \
spacers_cut1_blastn.xml \
> spacers_cut1_blastn.srl
# checking output file
!cd $blastDir; \
ls -thlc spacers_cut1_blastn.*
Note:
CLdb -- arrayBlast -- run
directly into CLdb -- arrayBlast -- xml2srl
CLdb -- arrayBlast -- run -query spacers_cut1.fna | CLdb -- arrayBlast -- xml2srl
In [86]:
# selecting DRs, so we can blast them.
## This is needed blast filter out spacers that hit CRISPR arrays
!cd $blastDir; \
CLdb -- array2fasta \
-cluster -cut 1 -r \
> DRs_cut1.fna
## getting number of spacer sequences
!printf 'Number of unique DRs:\t'
!cd $blastDir; \
grep -c ">" DRs_cut1.fna; \
echo; \
head -n 6 DRs_cut1.fna
In [87]:
# DR blast, just like spacer blast
!cd $blastDir; \
CLdb -- arrayBlast -- run \
-query DRs_cut1.fna \
> DRs_cut1_blastn.xml
In [88]:
# converting .xml to .srl
## This retains all of the info in the xml, but produces a smaller file that is faster to load and write.
## For large blast runs (xml files > 100 Mb), this will take a little while
!cd $blastDir; \
CLdb -- arrayBlast -- xml2srl \
DRs_cut1_blastn.xml \
> DRs_cut1_blastn.srl
# checking output file
!cd $blastDir; \
ls -thlc DRs_cut1_blastn.*
In [89]:
# filtering spacer hits to arrays
## This is based on the premise that DR hits adjacent to spacer hits signify that the spacer is hitting a CRISPR array
!cd $blastDir; \
CLdb -- arrayBlast -- filter_arrayHits \
spacers_cut1_blastn.srl \
DRs_cut1_blastn.srl \
> spacers_cut1_blastn_filt.srl
# checking output file
!cd $blastDir; \
echo; \
ls -thlc spacers_cut1_blastn_filt.srl
In [90]:
!cd $blastDir; \
CLdb -- arrayBlast -- srl2csv \
spacers_cut1_blastn_filt.srl \
> spacers_cut1_blastn_filt.txt
# assessing table of blast hits
!cd $blastDir; \
head spacers_cut1_blastn_filt.txt
In [91]:
#set the number of threads to use for your machine
num_threads=20
In [92]:
## spacer blast with blastn
!cd $blastDir; \
blastn -task blastn-short -db nt -query spacers_cut1.fna \
-outfmt 5 -evalue 1e-5 -num_threads $num_threads > spacers_cut1_vNT.xml
## checking output
!cd $blastDir; \
ls -thlc spacers_cut1_vNT.xml
In [93]:
## spacer blast with blastn
!cd $blastDir; \
blastn -task blastn-short -db nt -query DRs_cut1.fna \
-outfmt 5 -evalue 1e-5 -num_threads $num_threads > DRs_cut1_vNT.xml
## checking output
!cd $blastDir; \
ls -thlc DRs_cut1_vNT.xml
In [94]:
## spacers
!cd $blastDir; \
CLdb -- arrayBlast -- xml2srl \
spacers_cut1_vNT.xml \
> spacers_cut1_vNT.srl
## DRs
!cd $blastDir; \
CLdb -- arrayBlast -- xml2srl \
DRs_cut1_vNT.xml \
> DRs_cut1_vNT.srl
## checking output
!cd $blastDir; \
ls -thlc *_vNT.srl
In [95]:
# filtering spacer hits to arrays
## This is based on the premise that DR hits adjacent to spacer hits signify that the spacer is hitting a CRISPR array
!cd $blastDir; \
CLdb -- arrayBlast -- filter_arrayHits \
spacers_cut1_vNT.srl \
DRs_cut1_vNT.srl \
> spacers_cut1_vNT_blastn_filt.srl
## checking output file
!cd $blastDir; echo; \
ls -thlc spacers_cut1_vNT_blastn_filt.srl
Great! It's looks like we have some hits to some putative protospacers!
In [98]:
# The srl file can be converted to a json or csv file if you would like to use tools external to CLdb.
!cd $blastDir; \
CLdb -- arrayBlast -- srl2json \
< spacers_cut1_vNT_blastn_filt.srl \
> spacers_cut1_vNT_blastn_filt.json
## viewing some of the json file
!cd $blastDir; \
cat spacers_cut1_vNT_blastn_filt.json | \
python -mjson.tool 2>/dev/null | head -n 40
In [100]:
!cd $blastDir; \
CLdb -- arrayBlast -- add_crRNA \
< spacers_cut1_vNT_blastn_filt.srl \
> spacers_cut1_vNT_blastn_filt_crDNA.srl
## checking output
!cd $blastDir; echo; \
ls -thlc spacers_cut1_vNT_blastn_filt_crDNA.srl
In [102]:
!cd $blastDir; \
CLdb -- arrayBlast -- add_proto -workers 24 \
< spacers_cut1_vNT_blastn_filt_crDNA.srl \
> spacers_cut1_vNT_blastn_filt_crDNA_proto.srl
## checking output
!cd $blastDir; echo; \
ls -thlc spacers_cut1_vNT_blastn_filt_crDNA_proto.srl
In [106]:
# aligning crRNA and protospacer
!cd $blastDir; \
CLdb -- arrayBlast -- align_proto \
< spacers_cut1_vNT_blastn_filt_crDNA_proto.srl \
> spacers_cut1_vNT_blastn_filt_crDNA_proto_aln.srl
## checking output
!cd $blastDir; echo; \
ls -thlc spacers_cut1_vNT_blastn_filt_crDNA_proto_aln.srl
CLdb -- arrayBlast --perldoc -- get_align
for info on what -outfmt
does
In [108]:
# getting alignment of protospacer and crDNA and writing as a fasta
!cd $blastDir; \
CLdb -- arrayBlast -- get_align \
-outfmt taxon_name,subtype,evalue \
< spacers_cut1_vNT_blastn_filt_crDNA_proto_aln.srl \
> vNT_crDNA-proto_aln.fna
## checking output
!cd $blastDir; echo; \
ls -thlc vNT_crDNA-proto_aln.fna
In [109]:
# let's take a look at the sequences
!cd $blastDir; \
head -n 10 vNT_crDNA-proto_aln.fna
Example:
>crDNA
ATAGACA
>proto
ATAGACA
Note: the lower case letters are sequence adjacent to the actual protospacer
If you wanted each crRNA-protospacer pair individually, you could use the split
bash command like this:
WARNING: this produces a lot of files, so we will make them in a new directory
In [118]:
# file/directory
inFile = os.path.join(blastDir, 'vNT_crDNA-proto_aln.fna')
outDir = os.path.join(blastDir, 'vNT_crDNA-proto_aln_split')
if not os.path.isdir(outDir):
os.makedirs(outDir)
outPre = os.path.join(outDir, 'split')
# spliting alignment
!split -l 4 -d -a 4 $inFile $outPre
# checking out files
!printf 'Number of files produced: '
!find $outDir -name "split*" | wc -l
In [123]:
## Let look at one of them
!find $outDir -name "split*" 2>/dev/null | head -n 1 | xargs -I % cat % 2>/dev/null
In [132]:
!cd $blastDir; \
CLdb -- arrayBlast -- get_PAM -PAM 2 4 -f - \
< vNT_crDNA-proto_aln.fna \
> vNT_crDNA-proto_aln_PAM.fna
## checking output
!cd $blastDir; echo; \
head vNT_crDNA-proto_aln_PAM.fna
In [133]:
!cd $blastDir; \
egrep -v "^>" vNT_crDNA-proto_aln_PAM.fna | \
sort | uniq -c | sort -k 1 -n -r
And if we reverse the sequence...
In [134]:
!cd $blastDir; \
egrep -v "^>" vNT_crDNA-proto_aln_PAM.fna | \
rev | sort | uniq -c | sort -k 1 -n -r
We get the (less prevalent) PAM region ('TGT') for known I-E CRISPR systems as shown in Rorek et al., 2013!
In [136]:
!cd $blastDir; \
CLdb -- arrayBlast -- get_SEEDstats \
--SEED -8 -1 \
-prefix vNT_crDNA-proto_aln \
< vNT_crDNA-proto_aln.fna
## checking output files
!cd $blastDir; echo; \
ls -thlc vNT_crDNA-proto_aln_SEEDstats*.txt
In [140]:
# checking files
!cd $blastDir; \
head -n 6 vNT_crDNA-proto_aln_SEEDstats-byPos.txt
print '-' * 50
!cd $blastDir; \
head -n 6 vNT_crDNA-proto_aln_SEEDstats-summary.txt
In [141]:
%%R
library(ggplot2)
library(gridExtra)
In [142]:
%%R -i blastDir -w 800
# loading table
infile = file.path(blastDir, 'vNT_crDNA-proto_aln_SEEDstats-byPos.txt')
tbl = read.delim(infile)
tbl = tbl[tbl$region != 'protospacer', ]
# mismatches and counts, & normalized mismatches
p.mis = ggplot(tbl, aes(pos_rel_SEED, mismatches, group=region, fill=region)) +
geom_bar(stat='identity', position='dodge') +
labs(y='Number\nof mismatches', x='Alignment position relative to SEED start')
p.count = ggplot(tbl, aes(pos_rel_SEED, count, group=region, fill=region)) +
geom_bar(stat='identity', position='dodge') +
labs(y='Number of alignments\nwith nucleotide\nat position', x='Alignment position relative to SEED start')
p.norm = ggplot(tbl, aes(pos_rel_SEED, mismatch_norm, group=region, fill=region)) +
geom_bar(stat='identity', position='dodge') +
labs(y='Mismatchs /\nalignments', x='Alignment position relative to SEED start')
grid.arrange(p.count, p.mis, p.norm, ncol=1)
Notes on plots:
In [ ]: