This notebook describes the setup of CLdb with a set of E. coli genomes.
Notes
In [287]:
# Directory holding the raw example files (the zip archives unpacked further down).
# A leading "~" is fine here: it is expanded with os.path.expanduser in a later cell.
## CHANGE THIS!
rawFileDir = "~/perl/projects/CLdb/data/Ecoli/"
# Directory where the CLdb database will be created (made later if it does not exist yet).
## CHANGE THIS!
workDir = "~/t/CLdb_Ecoli/"
In [288]:
# viewing file links
import os
import zipfile
import csv
from IPython.display import FileLinks
# pretty viewing of tables
## get from: http://epmoyer.github.io/ipy_table/
from ipy_table import *
In [290]:
# Replace the leading "~" in both configured paths with the user's home directory,
# so the paths work with os.path functions as well as in shell commands.
rawFileDir, workDir = [os.path.expanduser(p) for p in (rawFileDir, workDir)]
The required files are in the directory set by `rawFileDir` above (e.g. '../ecoli_raw/'):
Let's look at the provided files for this example:
In [151]:
# Display clickable links to the files found under the raw-data directory
FileLinks(rawFileDir)
Out[151]:
In [152]:
# Print the top-level CLdb help; this also fails early if CLdb is not on the PATH
!CLdb -h
In [153]:
# Create the working directory (and any missing parents).
# Attempting the creation and tolerating "already exists" avoids the
# check-then-create race of testing os.path.isdir() first; any other
# failure (e.g. the path is an existing file, or permissions) is re-raised.
try:
    os.makedirs(workDir)
except OSError:
    if not os.path.isdir(workDir):
        raise
In [154]:
# Unarchive the raw example files from the raw folder into the working folder.
files = ['array.zip', 'loci.zip', 'GIs.txt.zip']
files = [os.path.join(rawFileDir, x) for x in files]
for f in files:
    if not os.path.isfile(f):
        # call-style raise is valid on both Python 2 and Python 3
        raise IOError('Cannot find file: {}'.format(f))
    # The context manager closes the archive handle after extraction.
    # The handle is deliberately NOT named `zip`: that would shadow the
    # builtin zip() for every later cell run in this kernel.
    with zipfile.ZipFile(f) as archive:
        archive.extractall(path=workDir)

print('unzipped raw files:')
# last expression of the cell: links to everything now in the working folder
FileLinks(workDir)
Out[154]:
In [155]:
# making genbank directory
genbankDir = os.path.join(workDir, 'genbank')
if not os.path.isdir(genbankDir):
os.makedirs(genbankDir)
# downloading genomes
!cd $genbankDir; \
CLdb -- accession-GI2fastaGenome -format genbank -fork 5 < ../GIs.txt
# checking files
!cd $genbankDir; \
ls -thlc *.gbk
In [271]:
# Print the help text of the makeDB subcommand
!CLdb -- makeDB -h
In [272]:
# Run makeDB from inside the working directory
# (the -r and -drop flags are described in the `CLdb -- makeDB -h` output).
!cd $workDir; \
CLdb -- makeDB -r -drop
# Path the rest of the notebook uses for the database file.
# NOTE(review): assumes makeDB names its output 'CLdb.sqlite' in the current
# directory — confirm against the makeDB help text.
CLdbFile = os.path.join(workDir, 'CLdb.sqlite')
print 'CLdb file location: {}'.format(CLdbFile)
In [273]:
# Write a per-user config file (~/.CLdb) holding the database location.
# Note: an existing ~/.CLdb is overwritten.
configFile = os.path.join(os.path.expanduser('~'), '.CLdb')
with open(configFile, 'wb') as configFH:
    # single "DATABASE = <path>" entry, written without a trailing newline
    configFH.write('DATABASE = ' + CLdbFile)

print('Config file written: {}'.format(configFile))
In [274]:
# Location of the tab-delimited loci table unpacked into the working directory.
lociFile = os.path.join(workDir, 'loci', 'loci.txt')

# Read every row of the file into a list of lists
# ('rb' is the file mode the Python 2 csv module expects).
with open(lociFile, 'rb') as lociFH:
    tbl = list(csv.reader(lociFH, delimiter='\t'))

# Render the rows with ipy_table; the themed table is the cell's output.
make_table(tbl)
apply_theme('basic')
Out[274]:
Notes on the loci table:
CLdb -- loadLoci -h
.
In [275]:
# Print the help text of the loadLoci subcommand
!CLdb -- loadLoci -h
In [276]:
# Run loadLoci with the loci table (the file at lociFile) supplied on stdin
!CLdb -- loadLoci < $lociFile
Notes on loading
Notes on the command
Why wasn't the `-database` flag used for `CLdb -- loadLoci`?
The `-database` flag is not needed because the database location is provided via the `.CLdb` config file that was previously created.
In [277]:
# Quick summary of the database contents at this point
## Expected: 10 loci reported in the 'loci' rows
!CLdb -- summary
The summary doesn't show anything for spacers, DRs, genes or leaders!
That's because we haven't loaded that info yet...
In [278]:
# An example array file (obtained from CRISPRFinder), taken from the 'array'
# folder inside the working directory; `head` prints its first lines
arrayFile = os.path.join(workDir, 'array', 'Ecoli_0157_H7_a1.txt')
!head $arrayFile
Note: the array file consists of 4 columns:
All extra columns are ignored!
In [279]:
# Loading CRISPR array info into the database.
# No input file is passed on the command line.
# NOTE(review): presumably the array files are located through the already-loaded
# loci table — confirm with `CLdb -- loadArrays -h`.
!CLdb -- loadArrays
In [280]:
# Quick summary of the database again, now that the arrays have been loaded
!CLdb -- summary
Note: The output should show 75 spacer & 85 DR entries in the database
In [281]:
# Directory that will hold the gene table and its log file.
geneDir = os.path.join(workDir, 'genes')
# Create it if needed; tolerating "already exists" avoids the
# check-then-create race of testing os.path.isdir() first.
try:
    os.makedirs(geneDir)
except OSError:
    if not os.path.isdir(geneDir):
        raise
In [282]:
# Run getGenesInLoci inside the genes directory:
# stdout (the gene table) goes to CAS.txt, stderr goes to CAS.log
!cd $geneDir; \
CLdb -- getGenesInLoci 2> CAS.log > CAS.txt
# checking output: first and last 5 lines of the log, then the first 5 lines
# of the gene table, separated by dashed lines
!cd $geneDir; \
head -n 5 CAS.log; \
echo -----------; \
tail -n 5 CAS.log; \
echo -----------; \
head -n 5 CAS.txt
In [283]:
# Loading the gene table into the database:
# CAS.txt (written by the getGenesInLoci step) is supplied to loadGenes on stdin
!cd $geneDir; \
CLdb -- loadGenes < CAS.txt
In [284]:
# Run the setSenseStrand subcommand with its default options.
# NOTE(review): what it records is not shown here — see `CLdb -- setSenseStrand -h`.
!CLdb -- setSenseStrand
In [285]:
# Run the clusterArrayElements subcommand with the -s and -r flags.
# NOTE(review): presumably -s/-r select spacers and direct repeats — confirm
# with `CLdb -- clusterArrayElements -h`.
!CLdb -- clusterArrayElements -s -r
In [286]:
# Final database summary, this time with the -name and -subtype flags
# (see `CLdb -- summary -h` for how they change the report)
!CLdb -- summary -name -subtype
In [ ]: