BioPython is a set of Python libraries for use in bioinformatics. The project has an excellent tutorial.

You can install BioPython into anaconda with the command:

    conda install biopython

One of the most commonly used parts of BioPython is the Bio.SeqIO module, used for reading and writing sequences. This has its own little documentation page.


In [1]:
import Bio.SeqIO

Let's get some data to work with:

    cd ~/Documents/python
    mkdir data
    cd data
    wget -c 'http://j.mp/tbfasta'
    wget -c 'http://j.mp/tbgbk'
    wget -c 'http://j.mp/tbgff'

This fetches NC_000962.fna, NC_000962.gbk and NC_000962.gff which we can use in addition to the existing python.fasta (retrieved from ftp://ftp.sanbi.ac.za/python.fasta).


In [2]:
import Bio.SeqIO

filename = 'python.fasta'
for seq in Bio.SeqIO.parse(filename, 'fasta'):
    print seq.id


gi|2765658|emb|Z78533.1|CIZ78533
gi|2765657|emb|Z78532.1|CCZ78532
gi|2765656|emb|Z78531.1|CFZ78531
gi|2765655|emb|Z78530.1|CMZ78530
gi|2765654|emb|Z78529.1|CLZ78529
gi|2765652|emb|Z78527.1|CYZ78527
gi|2765651|emb|Z78526.1|CGZ78526
gi|2765650|emb|Z78525.1|CAZ78525
gi|2765649|emb|Z78524.1|CFZ78524
gi|2765648|emb|Z78523.1|CHZ78523
gi|2765647|emb|Z78522.1|CMZ78522
gi|2765646|emb|Z78521.1|CCZ78521
gi|2765645|emb|Z78520.1|CSZ78520
gi|2765644|emb|Z78519.1|CPZ78519
gi|2765643|emb|Z78518.1|CRZ78518
gi|2765642|emb|Z78517.1|CFZ78517
gi|2765641|emb|Z78516.1|CPZ78516
gi|2765640|emb|Z78515.1|MXZ78515
gi|2765639|emb|Z78514.1|PSZ78514
gi|2765638|emb|Z78513.1|PBZ78513
gi|2765637|emb|Z78512.1|PWZ78512
gi|2765636|emb|Z78511.1|PEZ78511
gi|2765635|emb|Z78510.1|PCZ78510
gi|2765634|emb|Z78509.1|PPZ78509
gi|2765633|emb|Z78508.1|PLZ78508
gi|2765632|emb|Z78507.1|PLZ78507
gi|2765631|emb|Z78506.1|PLZ78506
gi|2765630|emb|Z78505.1|PSZ78505
gi|2765629|emb|Z78504.1|PKZ78504
gi|2765628|emb|Z78503.1|PCZ78503
gi|2765627|emb|Z78502.1|PBZ78502
gi|2765626|emb|Z78501.1|PCZ78501
gi|2765625|emb|Z78500.1|PWZ78500
gi|2765624|emb|Z78499.1|PMZ78499
gi|2765623|emb|Z78498.1|PMZ78498
gi|2765622|emb|Z78497.1|PDZ78497
gi|2765621|emb|Z78496.1|PAZ78496
gi|2765620|emb|Z78495.1|PEZ78495
gi|2765619|emb|Z78494.1|PNZ78494
gi|2765618|emb|Z78493.1|PGZ78493
gi|2765617|emb|Z78492.1|PBZ78492
gi|2765616|emb|Z78491.1|PCZ78491
gi|2765615|emb|Z78490.1|PFZ78490
gi|2765614|emb|Z78489.1|PDZ78489
gi|2765613|emb|Z78488.1|PTZ78488
gi|2765612|emb|Z78487.1|PHZ78487
gi|2765611|emb|Z78486.1|PBZ78486
gi|2765610|emb|Z78485.1|PHZ78485
gi|2765609|emb|Z78484.1|PCZ78484
gi|2765608|emb|Z78483.1|PVZ78483
gi|2765607|emb|Z78482.1|PEZ78482
gi|2765606|emb|Z78481.1|PIZ78481
gi|2765605|emb|Z78480.1|PGZ78480
gi|2765604|emb|Z78479.1|PPZ78479
gi|2765603|emb|Z78478.1|PVZ78478
gi|2765602|emb|Z78477.1|PVZ78477
gi|2765601|emb|Z78476.1|PGZ78476
gi|2765600|emb|Z78475.1|PSZ78475
gi|2765599|emb|Z78474.1|PKZ78474
gi|2765598|emb|Z78473.1|PSZ78473
gi|2765597|emb|Z78472.1|PLZ78472
gi|2765596|emb|Z78471.1|PDZ78471
gi|2765595|emb|Z78470.1|PPZ78470
gi|2765594|emb|Z78469.1|PHZ78469
gi|2765593|emb|Z78468.1|PAZ78468
gi|2765592|emb|Z78467.1|PSZ78467
gi|2765591|emb|Z78466.1|PPZ78466
gi|2765590|emb|Z78465.1|PRZ78465
gi|2765589|emb|Z78464.1|PGZ78464
gi|2765588|emb|Z78463.1|PGZ78463
gi|2765587|emb|Z78462.1|PSZ78462
gi|2765586|emb|Z78461.1|PWZ78461
gi|2765585|emb|Z78460.1|PCZ78460
gi|2765584|emb|Z78459.1|PDZ78459
gi|2765583|emb|Z78458.1|PHZ78458
gi|2765582|emb|Z78457.1|PCZ78457
gi|2765581|emb|Z78456.1|PTZ78456
gi|2765580|emb|Z78455.1|PJZ78455
gi|2765579|emb|Z78454.1|PFZ78454
gi|2765578|emb|Z78453.1|PSZ78453
gi|2765577|emb|Z78452.1|PBZ78452
gi|2765576|emb|Z78451.1|PHZ78451
gi|2765575|emb|Z78450.1|PPZ78450
gi|2765574|emb|Z78449.1|PMZ78449
gi|2765573|emb|Z78448.1|PAZ78448
gi|2765572|emb|Z78447.1|PVZ78447
gi|2765571|emb|Z78446.1|PAZ78446
gi|2765570|emb|Z78445.1|PUZ78445
gi|2765569|emb|Z78444.1|PAZ78444
gi|2765568|emb|Z78443.1|PLZ78443
gi|2765567|emb|Z78442.1|PBZ78442
gi|2765566|emb|Z78441.1|PSZ78441
gi|2765565|emb|Z78440.1|PPZ78440
gi|2765564|emb|Z78439.1|PBZ78439

In [32]:
filename = 'python.fasta'
parser = Bio.SeqIO.parse(filename, 'fasta')
print parser
sequence = parser.next()
print type(sequence)
print type(sequence.seq)
#print sequence.seq
nuc_sequence = sequence.seq
first_100_bases = nuc_sequence[:100]
print "first 100 bases:", first_100_bases
print nuc_sequence.alphabet
print sequence.id


<generator object parse at 0x7f0036aa5cd0>
<class 'Bio.SeqRecord.SeqRecord'>
<class 'Bio.Seq.Seq'>
first 100 bases: CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAACGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACC
SingleLetterAlphabet()
gi|2765658|emb|Z78533.1|CIZ78533

In [33]:
print "100 bases:\t\t", first_100_bases
print "100 bases (compl)\t", first_100_bases.complement()
print "100 bases (r_compl)\t", first_100_bases.reverse_complement()
print "100 bases (as RNA):\t", first_100_bases.transcribe()
codons = first_100_bases[:90]
print "first 90 bases as AAs:", codons.translate()
print "first 90 bases as AAs:", codons.translate(table="Vertebrate Mitochondrial")


100 bases:		CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAACGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACC
100 bases (compl)	GCATTGTTCCAAAGGCATCCACTTGGACGCCTTCCTAGTAACTACTCTGGCACCTTATTTGCTAGCTCACTTAGGCCTCCTGGCCACATGAGTCGAGTGG
100 bases (r_compl)	GGTGAGCTGAGTACACCGGTCCTCCGGATTCACTCGATCGTTTATTCCACGGTCTCATCAATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACG
100 bases (as RNA):	CGUAACAAGGUUUCCGUAGGUGAACCUGCGGAAGGAUCAUUGAUGAGACCGUGGAAUAAACGAUCGAGUGAAUCCGGAGGACCGGUGUACUCAGCUCACC
first 90 bases as AAs: RNKVSVGEPAEGSLMRPWNKRSSESGGPVY
first 90 bases as AAs: RNKVSVGEPAEGSLM*PWNKRSSESGGPVY

In [36]:
first_90 = nuc_sequence[:90]
print first_90.alphabet
rna_90 = first_90.transcribe()
print rna_90.alphabet
aa_30 = rna_90.translate()
print aa_30.alphabet


SingleLetterAlphabet()
RNAAlphabet()
ExtendedIUPACProtein()

In [37]:
filename = 'python.fasta'
parser = Bio.SeqIO.parse(filename, 'fasta')
sequence = parser.next()
# SeqRecord attributes
print "sequence id:", sequence.id
print "sequence display id:", sequence.name
print "sequence description:", sequence.description


sequence id: gi|2765658|emb|Z78533.1|CIZ78533
sequence display id: gi|2765658|emb|Z78533.1|CIZ78533
sequence description: gi|2765658|emb|Z78533.1|CIZ78533 C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA

In [45]:
filename = 'data/NC_000962.gbk'
sequence = Bio.SeqIO.read(filename, 'genbank')
print "sequence id:", sequence.id
print "sequence display id:", sequence.name
print "sequence description:", sequence.description
print "sequence annotations:", type(sequence.annotations)
for annotation_key in sequence.annotations:
    print annotation_key
print "taxonomy:", sequence.annotations['taxonomy']
print "references:", sequence.annotations['references']
print "organism:", sequence.annotations['organism']


sequence id: NC_000962.3
sequence display id: NC_000962
sequence description: Mycobacterium tuberculosis H37Rv, complete genome.
sequence annotations: <type 'dict'>
comment
sequence_version
source
taxonomy
keywords
references
accessions
data_file_division
date
contig
organism
gi
taxonomy: ['Bacteria', 'Actinobacteria', 'Actinobacteridae', 'Actinomycetales', 'Corynebacterineae', 'Mycobacteriaceae', 'Mycobacterium', 'Mycobacterium tuberculosis complex']
references: [Reference(title='TubercuList--10 years after', ...), Reference(title='Re-annotation of the genome sequence of Mycobacterium tuberculosis H37Rv', ...), Reference(title='Deciphering the biology of Mycobacterium tuberculosis from the complete genome sequence', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
organism: Mycobacterium tuberculosis H37Rv

In [39]:
filename = 'python.fasta'
# .read() doesn't work with files containing more than 1 record
sequence = Bio.SeqIO.read(filename, 'fasta')


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-39-5c6adcb3e2f3> in <module>()
      1 filename = 'python.fasta'
----> 2 sequence = Bio.SeqIO.read(filename, 'fasta')

/home/pvh/anaconda/lib/python2.7/site-packages/Bio/SeqIO/__init__.pyc in read(handle, format, alphabet)
    663         second = None
    664     if second is not None:
--> 665         raise ValueError("More than one record found in handle")
    666     return first
    667 

ValueError: More than one record found in handle

In [52]:
filename = 'data/NC_000962.gbk'
sequence = Bio.SeqIO.read(filename, 'genbank')
first_feature = sequence.features[0]
print first_feature
first_feature_location = first_feature.location
print "start:",first_feature_location.start
print "end:", first_feature_location.end
print "strand:", first_feature_location.strand


type: source
location: [0:4411532](+)
qualifiers:
    Key: db_xref, Value: ['taxon:83332']
    Key: mol_type, Value: ['genomic DNA']
    Key: organism, Value: ['Mycobacterium tuberculosis H37Rv']
    Key: strain, Value: ['H37Rv']

start: 0
end: 4411532
strand: 1

In [55]:
filename = 'data/NC_000962.gbk'
sequence = Bio.SeqIO.read(filename, 'genbank')
gene_feature = None
for feature in sequence.features:
    if feature.type == 'gene':
        gene_feature = feature
        break
print gene_feature.location.start
print gene_feature.location.end
print gene_feature
start = gene_feature.location.start
end = gene_feature.location.end
# get just the raw sequence
gene_nucleotides = sequence.seq[start:end]
# get part of the sequence record
gene_sequence = sequence[start:end]
print len(gene_sequence)
print type(gene_sequence)
print gene_sequence


0
1524
type: gene
location: [0:1524](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:885041']
    Key: experiment, Value: ['DESCRIPTION:Mutation analysis, gene expression[PMID: 10375628]']
    Key: gene, Value: ['dnaA']
    Key: locus_tag, Value: ['Rv0001']

1524
<class 'Bio.SeqRecord.SeqRecord'>
ID: NC_000962.3
Name: NC_000962
Description: Mycobacterium tuberculosis H37Rv, complete genome.
Number of features: 10
Seq('TTGACCGATGACCCCGGTTCAGGCTTCACCACAGTGTGGAACGCGGTCGTCTCC...TAG', IUPACAmbiguousDNA())

In [58]:
aa_seq = gene_sequence.seq.translate()
print aa_seq


LTDDPGSGFTTVWNAVVSELNGDPKVDDGPSSDANLSAPLTPQQRAWLNLVQPLTIVEGFALLSVPSSFVQNEIERHLRAPITDALSRRLGHQIQLGVRIAPPATDEADDTTVPPSENPATTSPDTTTDNDEIDDSAAARGDNQHSWPSYFTERPHNTDSATAGVTSLNRRYTFDTFVIGASNRFAHAAALAIAEAPARAYNPLFIWGESGLGKTHLLHAAGNYAQRLFPGMRVKYVSTEEFTNDFINSLRDDRKVAFKRSYRDVDVLLVDDIQFIEGKEGIQEEFFHTFNTLHNANKQIVISSDRPPKQLATLEDRLRTRFEWGLITDVQPPELETRIAILRKKAQMERLAVPDDVLELIASSIERNIRELEGALIRVTAFASLNKTPIDKALAEIVLRDLIADANTMQISAATIMAATAEYFDTTVEELRGPGKTRALAQSRQIAMYLCRELTDLSLPKIGQAFGRDHTTVMYAQRKILSEMAERREVFDHVKELTTRIRQRSKR*

In [61]:
print "original sequence feature count:", len(sequence.features)
print "dnaA region feature count:", len(gene_sequence.features)


original sequence feature count: 19344
dnaA region feature count: 10

In [ ]: