1. Data acquisition

Pfam database


In [ ]:
# Fetch the Pfam-A HMM library (release 30.0) into ../database/pfam,
# then unpack it in place, overwriting any previously extracted copy.
! wget --directory-prefix=../database/pfam ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam30.0/Pfam-A.hmm.gz
! gzip --decompress --verbose --force ../database/pfam/Pfam-A.hmm.gz

Protein sequences


In [ ]:
# Cancer protein sequence sets from UniProtKB, in increasing size.
small_fasta, medium_fasta, large_fasta = (
    '../database/sequence/small.fasta',   # 50 sequences
    '../database/sequence/medium.fasta',  # 100 sequences
    '../database/sequence/large.fasta',   # 250 sequences
)

2. Data preparation

Pfam database


In [ ]:
# hmmpress: build the binary index files for Pfam-A.hmm that hmmscan
# searches against.
# -f overwrites index files left by a previous run; without it hmmpress
# aborts when they already exist, so re-running the notebook would fail here.
! hmmpress -f ../database/pfam/Pfam-A.hmm

Sequence database


In [1]:
# Run prepare_seqdb.sh once per sequence set. The two arguments are the
# FASTA file under ../database/sequence and the matching path under ../input
# (presumably source then destination -- confirm against the script).
for fasta_name in ('small', 'medium', 'large'):
    ! bash prepare_seqdb.sh ../database/sequence/{fasta_name}.fasta ../input/{fasta_name}.fasta

3. Data analysis


In [41]:
import datetime
import os

job_started = datetime.datetime.now()

# Resolve project paths from the notebook's working directory instead of
# hard-coding one user's home directory. The shell cells in this notebook
# already rely on the same working directory through their `../` paths.
PROJECT_ROOT = os.path.abspath("..")

# convert sequence database to Spark RDD (one element per line of the file)
SEQ_DB = "file://" + os.path.join(PROJECT_ROOT, "input", "large.fasta")
data = sc.textFile(SEQ_DB)

# pipe Spark RDD to hmmscan: each partition is streamed through the script's
# stdin and the script's stdout lines become the new RDD
scriptPath = os.path.join(PROJECT_ROOT, "src", "hmmer.sh")
pipeRDD = data.pipe(scriptPath)

# collect result: RDD transformations are lazy, so this action is what
# actually runs the job; the returned lines are not used in this cell
pipeRDD.collect()

# report wall-clock time for the whole job as a timedelta (H:MM:SS.ffffff)
job_finished = datetime.datetime.now()
print(job_finished - job_started)


0:06:21.835106

4. Report


In [ ]:
# Print the contents of every .txt file under ../output
# (presumably the results written by the analysis step -- confirm against hmmer.sh).
! cat ../output/*.txt