BioPython is a set of Python libraries for use in bioinformatics. The project has an excellent tutorial.
You can install BioPython into anaconda with the command:
conda install biopython
One of the most commonly used parts of BioPython is the Bio.SeqIO module, used for reading and writing sequences. This has its own little documentation page.
In [1]:
import Bio.SeqIO
Let's get some data to work with:
cd ~/Documents/python
mkdir data
cd data
wget -c 'http://j.mp/tbfasta'
wget -c 'http://j.mp/tbgbk'
wget -c 'http://j.mp/tbgff'
This fetches NC_000962.fna, NC_000962.gbk and NC_000962.gff which we can use in addition to the existing python.fasta (retrieved from ftp://ftp.sanbi.ac.za/python.fasta).
In [2]:
import Bio.SeqIO
filename = 'python.fasta'
for seq in Bio.SeqIO.parse(filename, 'fasta'):
print seq.id
In [32]:
filename = 'python.fasta'
parser = Bio.SeqIO.parse(filename, 'fasta')
print parser
sequence = parser.next()
print type(sequence)
print type(sequence.seq)
#print sequence.seq
nuc_sequence = sequence.seq
first_100_bases = nuc_sequence[:100]
print "first 100 bases:", first_100_bases
print nuc_sequence.alphabet
print sequence.id
In [33]:
print "100 bases:\t\t", first_100_bases
print "100 bases (compl)\t", first_100_bases.complement()
print "100 bases (r_compl)\t", first_100_bases.reverse_complement()
print "100 bases (as RNA):\t", first_100_bases.transcribe()
codons = first_100_bases[:90]
print "first 90 bases as AAs:", codons.translate()
print "first 90 bases as AAs:", codons.translate(table="Vertebrate Mitochondrial")
In [36]:
first_90 = nuc_sequence[:90]
print first_90.alphabet
rna_90 = first_90.transcribe()
print rna_90.alphabet
aa_30 = rna_90.translate()
print aa_30.alphabet
In [37]:
filename = 'python.fasta'
parser = Bio.SeqIO.parse(filename, 'fasta')
sequence = parser.next()
# SeqRecord attributes
print "sequence id:", sequence.id
print "sequence display id:", sequence.name
print "sequence description:", sequence.description
In [45]:
filename = 'data/NC_000962.gbk'
sequence = Bio.SeqIO.read(filename, 'genbank')
print "sequence id:", sequence.id
print "sequence display id:", sequence.name
print "sequence description:", sequence.description
print "sequence annotations:", type(sequence.annotations)
for annotation_key in sequence.annotations:
print annotation_key
print "taxonomy:", sequence.annotations['taxonomy']
print "references:", sequence.annotations['references']
print "organism:", sequence.annotations['organism']
In [39]:
filename = 'python.fasta'
# .read() doesn't work with files containing more than 1 record
sequence = Bio.SeqIO.read(filename, 'fasta')
In [52]:
filename = 'data/NC_000962.gbk'
sequence = Bio.SeqIO.read(filename, 'genbank')
first_feature = sequence.features[0]
print first_feature
first_feature_location = first_feature.location
print "start:",first_feature_location.start
print "end:", first_feature_location.end
print "strand:", first_feature_location.strand
In [55]:
filename = 'data/NC_000962.gbk'
sequence = Bio.SeqIO.read(filename, 'genbank')
gene_feature = None
for feature in sequence.features:
if feature.type == 'gene':
gene_feature = feature
break
print gene_feature.location.start
print gene_feature.location.end
print gene_feature
start = gene_feature.location.start
end = gene_feature.location.end
# get just the raw sequence
gene_nucleotides = sequence.seq[start:end]
# get part of the sequence record
gene_sequence = sequence[start:end]
print len(gene_sequence)
print type(gene_sequence)
print gene_sequence
In [58]:
aa_seq = gene_sequence.seq.translate()
print aa_seq
In [61]:
print "original sequence feature count:", len(sequence.features)
print "dnaA region feature count:", len(gene_sequence.features)
In [ ]: