In [ ]:
    
# download and extract pfam database
! wget -P ../database/pfam ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam30.0/Pfam-A.hmm.gz
! gzip -dvf ../database/pfam/Pfam-A.hmm.gz
    
In [ ]:
    
# Benchmark inputs: cancer protein sequence sets from UniProtKB, in three
# sizes (small = 50, medium = 100, large = 250 sequences).
small_fasta, medium_fasta, large_fasta = (
    '../database/sequence/small.fasta',
    '../database/sequence/medium.fasta',
    '../database/sequence/large.fasta',
)
    
In [ ]:
    
# hmmpress
! hmmpress ../database/pfam/Pfam-A.hmm
    
In [1]:
    
# shell script
! bash prepare_seqdb.sh ../database/sequence/small.fasta ../input/small.fasta
! bash prepare_seqdb.sh ../database/sequence/medium.fasta ../input/medium.fasta
! bash prepare_seqdb.sh ../database/sequence/large.fasta ../input/large.fasta
    
In [41]:
    
import datetime

# Single place to change when the project is checked out somewhere else.
# NOTE(review): paths stay absolute because the input is addressed with a
# file:// URI and the script path is handed to Spark as-is — confirm the
# location exists wherever the job's tasks run.
PROJECT_ROOT = "/home/cloudera/cantaloupe"

started_at = datetime.datetime.now()
# convert sequence database to Spark RDD (textFile yields one element per line)
SEQ_DB = "file://" + PROJECT_ROOT + "/input/large.fasta"
data = sc.textFile(SEQ_DB)
# pipe Spark RDD to hmmscan: each element is written to the script's stdin
# and every line the script prints becomes an element of the resulting RDD
scriptPath = PROJECT_ROOT + "/src/hmmer.sh"
pipeRDD = data.pipe(scriptPath)
# collect() is the action that actually runs the job. Keep what the script
# printed instead of discarding it, so it can be inspected afterwards
# without re-running the whole search.
hmmscan_stdout = pipeRDD.collect()
finished_at = datetime.datetime.now()
# wall-clock duration of load + pipe + collect
elapsed = finished_at - started_at
print(elapsed)
    
    
In [ ]:
    
# Print the contents of every .txt file under ../output
# (presumably the hmmscan results written by the pipeline — confirm against hmmer.sh).
! cat ../output/*.txt