In [ ]:
# Fetch the gzipped Pfam-A HMM file (release 30.0) from the EBI FTP server
# into ../database/pfam, then decompress it in place. --force lets gzip
# overwrite an already-extracted Pfam-A.hmm from a previous run.
! wget --directory-prefix=../database/pfam ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam30.0/Pfam-A.hmm.gz
! gzip --decompress --verbose --force ../database/pfam/Pfam-A.hmm.gz
In [ ]:
# Paths to the three test inputs of increasing size: 50 (small), 100 (medium)
# and 250 (large) cancer protein sequences from UniProtKB.
small_fasta, medium_fasta, large_fasta = (
    '../database/sequence/%s.fasta' % size
    for size in ('small', 'medium', 'large')
)
In [ ]:
# Compress and index the Pfam HMM flatfile with hmmpress.
# -f overwrites index files left behind by an earlier run; without it
# hmmpress aborts when they already exist, so this cell could not be re-run.
! hmmpress -f ../database/pfam/Pfam-A.hmm
In [1]:
# Run prepare_seqdb.sh once per dataset size. The first argument is the
# FASTA file under ../database/sequence; the second is the matching path
# under ../input (presumably where the script writes its result — confirm
# against prepare_seqdb.sh).
for size in ('small', 'medium', 'large'):
    ! bash prepare_seqdb.sh ../database/sequence/{size}.fasta ../input/{size}.fasta
In [41]:
import datetime

# Wall-clock timestamp taken before any Spark work is set up.
a = datetime.datetime.now()

# Input sequence file (local-filesystem URI) and the script each RDD
# partition is streamed through (presumably a wrapper around hmmscan —
# hmmer.sh itself is not shown here).
SEQ_DB = "file:///home/cloudera/cantaloupe/input/large.fasta"
scriptPath = "/home/cloudera/cantaloupe/src/hmmer.sh"

# `sc` is expected to be the SparkContext supplied by the notebook kernel.
# Load the file as an RDD of text lines, then pipe those lines to the script.
data = sc.textFile(SEQ_DB)
pipeRDD = data.pipe(scriptPath)

# collect() is the action that makes Spark run the job; the list it
# returns is not kept.
pipeRDD.collect()

# Report the elapsed time for the whole run as a timedelta.
b = datetime.datetime.now()
print(b - a)
In [ ]:
# Print the contents of every .txt file in ../output into the notebook.
# NOTE(review): presumably these are the result files produced by the
# hmmer.sh run — confirm against that script.
! cat ../output/*.txt