In [40]:
# directory where you want the spacer blasting to be done
## CHANGE THIS!
workDir = "/home/nyoungb2/t/CLdb_Methanosarcina/"
In [41]:
import os
from IPython.display import FileLinks
%load_ext rpy2.ipython
In [42]:
blastDir = os.path.join(workDir, 'arrayBlast')
if not os.path.isdir(blastDir):
os.makedirs(blastDir)
In [43]:
!cd $blastDir; \
CLdb -- array2fasta -cluster -cut 1 > spacers_cut1.fna
## getting number of spacer sequences
!printf 'Number of unique spacers:\t'
!cd $blastDir; \
grep -c ">" spacers_cut1.fna; \
echo; \
head -n 6 spacers_cut1.fna
In [44]:
!CLdb -- arrayBlast -h
In [45]:
# listing all arrayBlast subcommands
!CLdb -- arrayBlast --list
In [54]:
# spacer blast
## Really just a thin wrapper around blastn that will blast the spacers against all genomes in CLdb
### If you want to blast some else, just run the blast yourself (use 'perldoc CLdb_arrayBlast.pl' for more info)
!cd $blastDir; \
CLdb -- arrayBlast -- run \
-query spacers_cut1.fna \
-fork 10 \
> spacers_cut1_blastn.xml
# NOTE: using 10 CPUs
Note:
In [55]:
# converting .xml to .srl
## This retains all of the info in the xml, but produces a smaller file that is faster to load and write.
## For large blast runs (xml files > 100 Mb), this will take a little while
!cd $blastDir; \
CLdb -- arrayBlast -- xml2srl \
spacers_cut1_blastn.xml \
> spacers_cut1_blastn.srl
# checking output file
!cd $blastDir; \
ls -thlc spacers_cut1_blastn.*
Note:
CLdb -- arrayBlast -- run
directly into CLdb -- arrayBlast -- xml2srl
CLdb -- arrayBlast -- run -query spacers_cut1.fna | CLdb -- arrayBlast -- xml2srl
In [56]:
# selecting DRs, so we can blast them.
## This is needed blast filter out spacers that hit CRISPR arrays
!cd $blastDir; \
CLdb -- array2fasta \
-cluster -cut 1 -r \
> DRs_cut1.fna
## getting number of spacer sequences
!printf 'Number of unique DRs:\t'
!cd $blastDir; \
grep -c ">" DRs_cut1.fna; \
echo; \
head -n 6 DRs_cut1.fna
In [57]:
# DR blast, just like spacer blast
!cd $blastDir; \
CLdb -- arrayBlast -- run \
-query DRs_cut1.fna \
-fork 10 \
> DRs_cut1_blastn.xml
In [58]:
# converting .xml to .srl
## This retains all of the info in the xml, but produces a smaller file that is faster to load and write.
## For large blast runs (xml files > 100 Mb), this will take a little while
!cd $blastDir; \
CLdb -- arrayBlast -- xml2srl \
DRs_cut1_blastn.xml \
> DRs_cut1_blastn.srl
# checking output file
!cd $blastDir; \
ls -thlc DRs_cut1_blastn.*
In [59]:
# filtering spacer hits to arrays
## This is based on the premise that DR hits adjacent to spacer hits signify that the spacer is hitting a CRISPR array
!cd $blastDir; \
CLdb -- arrayBlast -- filter_arrayHits \
spacers_cut1_blastn.srl \
DRs_cut1_blastn.srl \
> spacers_cut1_blastn_filt.srl
# checking output file
!cd $blastDir; \
echo; \
ls -thlc spacers_cut1_blastn_filt.srl
In [63]:
!cd $blastDir; \
CLdb -- arrayBlast -- srl2csv \
spacers_cut1_blastn_filt.srl \
> spacers_cut1_blastn_filt.txt
# assessing table of blast hits
!cd $blastDir; \
head spacers_cut1_blastn_filt.txt
In [64]:
#!printf "Number of blast hits: "
nhits = !cd $blastDir; wc -l spacers_cut1_blastn_filt.txt
nhits = nhits[0].split(' ')[0]
print "Number of blast hits: {}".format(int(nhits) - 1)
In [65]:
!cd $blastDir; \
CLdb -- arrayBlast -- add_crRNA \
< spacers_cut1_blastn_filt.srl \
> spacers_cut1_blastn_filt_crDNA.srl
## checking output
!cd $blastDir; echo; \
ls -thlc spacers_cut1_blastn_filt_crDNA.srl
In [67]:
!cd $blastDir; \
CLdb -- arrayBlast -- add_proto -workers 24 \
< spacers_cut1_blastn_filt_crDNA.srl \
> spacers_cut1_blastn_filt_crDNA_proto.srl
## checking output
!cd $blastDir; echo; \
ls -thlc spacers_cut1_blastn_filt_crDNA_proto.srl
In [68]:
# aligning crRNA and protospacer
!cd $blastDir; \
CLdb -- arrayBlast -- align_proto \
< spacers_cut1_blastn_filt_crDNA_proto.srl \
> spacers_cut1_blastn_filt_crDNA_proto_aln.srl
## checking output
!cd $blastDir; echo; \
ls -thlc spacers_cut1_blastn_filt_crDNA_proto_aln.srl
CLdb -- arrayBlast --perldoc -- get_align
for info on what -outfmt
does
In [69]:
# getting alignment of protospacer and crDNA and writing as a fasta
!cd $blastDir; \
CLdb -- arrayBlast -- get_align \
-outfmt taxon_name,subtype,evalue \
< spacers_cut1_blastn_filt_crDNA_proto_aln.srl \
> blastn_crDNA-proto_aln.fna
## checking output
!cd $blastDir; echo; \
ls -thlc blastn_crDNA-proto_aln.fna
In [70]:
# let's take a look at the sequences
!cd $blastDir; \
head -n 10 blastn_crDNA-proto_aln.fna
Example:
>crDNA
ATAGACA
>proto
ATAGACA
Note: the lower case letters are sequence adjacent to the actual protospacer
If you wanted each crRNA-protospacer pair individually, you could use the split
bash command like this:
WARNING: this produces a lot of files, so we will make them in a new directory
In [71]:
# file/directory
inFile = os.path.join(blastDir, 'blastn_crDNA-proto_aln.fna')
outDir = os.path.join(blastDir, 'blastn_crDNA-proto_aln_split')
if not os.path.isdir(outDir):
os.makedirs(outDir)
outPre = os.path.join(outDir, 'split')
# spliting alignment
!split -l 4 -d -a 4 $inFile $outPre
# checking out files
!printf 'Number of files produced: '
!find $outDir -name "split*" | wc -l
In [72]:
!cd $blastDir; \
CLdb -- arrayBlast -- get_PAM -PAM 1 4 -f - \
< blastn_crDNA-proto_aln.fna \
> blastn_crDNA-proto_aln_PAM.fna
## checking output
!cd $blastDir; echo; \
head blastn_crDNA-proto_aln_PAM.fna
In [73]:
!cd $blastDir; \
egrep -v "^>" blastn_crDNA-proto_aln_PAM.fna | \
sort | uniq -c | sort -k 1 -n -r
In [74]:
!cd $blastDir; \
CLdb -- arrayBlast -- get_SEEDstats \
--SEED -8 -1 \
-prefix blastn_crDNA-proto_aln \
< blastn_crDNA-proto_aln.fna
## checking output files
!cd $blastDir; echo; \
ls -thlc blastn_crDNA-proto_aln_SEEDstats*.txt
In [75]:
# checking files
!cd $blastDir; \
head -n 6 blastn_crDNA-proto_aln_SEEDstats-byPos.txt
print '-' * 50
!cd $blastDir; \
head -n 6 blastn_crDNA-proto_aln_SEEDstats-summary.txt
In [76]:
%%R
library(ggplot2)
library(gridExtra)
In [77]:
%%R -i blastDir -w 800
# loading table
infile = file.path(blastDir, 'blastn_crDNA-proto_aln_SEEDstats-byPos.txt')
tbl = read.delim(infile)
tbl = tbl[tbl$region != 'protospacer', ]
# mismatches and counts, & normalized mismatches
p.mis = ggplot(tbl, aes(pos_rel_SEED, mismatches, group=region, fill=region)) +
geom_bar(stat='identity', position='dodge') +
labs(y='Number\nof mismatches', x='Alignment position relative to SEED start')
p.count = ggplot(tbl, aes(pos_rel_SEED, count, group=region, fill=region)) +
geom_bar(stat='identity', position='dodge') +
labs(y='Number of alignments\nwith nucleotide\nat position', x='Alignment position relative to SEED start')
p.norm = ggplot(tbl, aes(pos_rel_SEED, mismatch_norm, group=region, fill=region)) +
geom_bar(stat='identity', position='dodge') +
labs(y='Mismatchs /\nalignments', x='Alignment position relative to SEED start')
grid.arrange(p.count, p.mis, p.norm, ncol=1)
Notes on plots:
In [ ]: