In [1]:
%pylab inline
%config InlineBackend.figure_format = 'retina'
This notebook rebuilds the co-phylogeny found in Host defense reinforces host-parasite cospeciation, by Dale H. Clayton, Sarah E. Bush, Brad M. Goates, and Kevin P. Johnson.
Clayton, Dale H., et al. "Host defense reinforces host–parasite cospeciation." Proceedings of the National Academy of Sciences 100.26 (2003): 15694-15699.
The original co-phylogeny appears in the paper as follows :
Python packages (available through pip) :
biopython : talking to NCBI, manipulating file formatspandas : keeping our data organizedete3 : visualization for phylogenetic treesSystem packages :
The original study used cytochrome oxidase subunit I (COI), cytochrome b
(CYTB) and beta fibrinogen (FGB). The deposited FGB sequences for some of
the doves look weird in the alignment and result in a tree with
nonsensically long branches for those organisms. So, I removed FGB and added
ND2, IRBP, MB, NTF3 and 12S. This mostly recapitulates the published phylogeny
using open source software (RAxML) and GTR+GAMMA instead of
GTR+I+GAMMA
In [2]:
from Bio import Entrez
Entrez.email = "ryneches@ucdavis.edu"
terms = { 'COI' : [ 'COI[gene] 350:450[slen]', 'cytochrome oxidase subunit I 350:450[slen]' ],
'CYTB' : [ 'cytb[gene] 800:1200[slen]', 'cytochrome b 800:1200[slen]' ],
'ND2' : [ 'ND2[gene] 980:1100[slen]', 'NADH dehydrogenase subunit 2 980:1100[slen]' ],
'IRBP' : [ 'IRBP[gene] 600:1000[slen]' ],
'RAG1' : [ 'RAG1[gene] 2000:3000[slen]', 'RAG-1[gene] 2000:3000[slen]' ],
# 'FGB' : [ 'FGB[gene] 800:1300[slen]', 'beta fibrinogen 800:1300[slen]',
# 'fibrinogen beta 800:1300[slen]', 'beta-fibrinogen 450:1300[slen]' ],
'MB' : [ 'MB[gene] 700:900[slen]', 'myoglobin 700:900[slen]' ],
'NGF' : [ 'NGF[gene] 600:1000[slen]', 'nerve growth factor beta polypeptide 600:1000[slen]' ],
'NTF3' : [ 'NTF3[gene] 600:1000[slen]', 'neurotrophin 3 600:1000[slen]' ],
'12S' : [ '12S ribosomal 300:500[slen]' ]
}
def find_txid( species ) :
handle = Entrez.esearch(db='taxonomy', term=species)
record = Entrez.read(handle)
if int(record['Count']) == 0 :
raise Exception( 'no records found' )
txid = record['IdList'][0]
return txid
def find_gene( txid, gene ) :
taxid_query = 'txid' + str(txid) + '[Orgn] '
query = ' or '.join( [ taxid_query + s for s in terms[gene] ] )
#print query
handle = Entrez.esearch(db='nucleotide', term=query )
record = Entrez.read(handle)
if int(record['Count']) == 0 :
raise Exception( 'no genes found' )
ntid = record['IdList'][0]
fasta = Entrez.efetch( db='nucleotide', id=ntid, rettype='fasta', retmode='text' )
return fasta.read()
In [3]:
birdnames = """Columba guinea
Columba speciosa
Metriopelia ceciliae
Leptotila jamaicensis
Leptotila plumbeiceps
Leptotila verreauxi
Leptotila rufaxilla
Phapitreron leucotis
Ptilinopus occipitalis
Streptopelia capicola
Streptopelia senegalensis
Streptopelia decaocto
Patagioenas fasciata
Columba subvinacea
Claravis pretiosa
Columba livia
Columba plumbea
Columbina inca
Columbina passerina
Geotrygon montana
Phapitreron amethystina
Zenaida asiatica
Zenaida galapagoensis
Zenaida macroura
Oena capensis"""
birdnames = sorted(birdnames.split('\n'))
In [4]:
birdids = []
for name in birdnames :
bid = find_txid( name )
birdids.append(bid)
print name, bid
bird_nametoid = zip( birdnames, birdids )
In [5]:
bird_genes = {}
for species, txid in bird_nametoid :
bird_genes[species] = {}
for genename in terms.keys() :
try :
bird_genes[species][genename] = find_gene( txid, genename )
except :
#bird_genes[species][genename] = ''
continue
print species, ':', ' '.join( sorted(bird_genes[species].keys()) )
In [6]:
import pandas
from Bio.SeqIO import parse
from Bio.SeqRecord import SeqRecord
from StringIO import StringIO
def str2seq( s ) :
if type( s ) == float :
return SeqRecord( '' )
return parse( StringIO( s ), 'fasta' ).next()
bird_genetable = pandas.DataFrame( bird_genes ).T
#bird_genetable.applymap( lambda x : str2seq( x ).description ).T
bird_genetable.applymap( lambda x : len(str2seq( x )) )
Out[6]:
In [7]:
def rename_fasta( s ) :
if type( s ) == float :
return SeqRecord( '' )
fasta = parse( StringIO( s ), 'fasta' ).next()
description = fasta.description.replace('PREDICTED:', '')
fasta.id = '_'.join( description.split()[1:3] )
fasta.description = ''
return fasta.format('fasta')
g = bird_genetable['NGF'].dropna()
g.apply( rename_fasta )
Out[7]:
In [8]:
for gene in bird_genetable.columns :
f = open( 'bird_' + gene + '.fasta', 'w' )
g = bird_genetable[gene].dropna()
g = g.apply( rename_fasta )
for record in g :
f.write( record )
f.close()
In [9]:
!clustalo -v --threads 4 -i bird_12S.fasta -o bird_12S_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_COI.fasta -o bird_COI_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_CYTB.fasta -o bird_CYTB_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_FGB.fasta -o bird_FGB_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_IRBP.fasta -o bird_IRBP_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_MB.fasta -o bird_MB_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_ND2.fasta -o bird_ND2_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_NGF.fasta -o bird_NGF_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_NTF3.fasta -o bird_NTF3_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_RAG1.fasta -o bird_RAG1_aln.fasta
In [10]:
from Bio import AlignIO
from Bio.Nexus import Nexus
from Bio import Alphabet
from glob import glob
nexi = [ (alnfile, Nexus.Nexus( AlignIO.read( alnfile, 'fasta', alphabet=Alphabet.generic_dna ).format('nexus') ) )
for alnfile in glob( 'bird*_aln.fasta' ) ]
combined = Nexus.combine( nexi )
f = open('bird.nex', 'w')
combined.write_nexus_data(f)
f.close()
# RAxML needs a phylip file
combined.export_phylip('bird.phylip')
Out[10]:
In [11]:
# RAxML partition file
f = open( 'bird.txt', 'w' )
for key in combined.charsets.keys() :
s = 'DNA, ' + key.split('_')[1] + ' = ' + str(combined.charsets[key][0]+1) + '-' + str(combined.charsets[key][-1]+1)
f.write( s + '\n' )
print s
f.close()
In [12]:
#!raxmlHPC -m MULTICAT -q bird.txt -s bird.phylip -n bird -p 10001
!raxmlHPC -m GTRGAMMA -q bird.txt -s bird.phylip -n bird -p 10001
In [13]:
from ete3 import Tree, TreeStyle, NodeStyle, TextFace
from numpy import linspace
ts = TreeStyle()
ts.mode = 'r'
ts.show_leaf_name = True
ts.branch_vertical_margin = 2
ts.scale = 1000
ts.show_leaf_name = False
ts.show_scale = False
nstyle = NodeStyle()
nstyle['size'] = 0
ete_tree = Tree( 'RAxML_bestTree.bird' )
ete_tree.set_outgroup('Claravis_pretiosa')
for node in ete_tree.traverse() :
node.set_style(nstyle)
if node.is_leaf :
tf = TextFace( node.name.replace('_',' ').replace('\'','') )
tf.fsize = 10
tf.hz_align = 100
node.add_face( tf, 0 )
ete_tree.render("%%inline", w=120, units="mm", tree_style=ts)
Out[13]:
In [25]:
louse_records = []
handle = Entrez.esearch(db='nucleotide', term='AF278608:AF278643[accn]' )
record = Entrez.read(handle)
for ntid in record['IdList'] :
fasta = Entrez.efetch( db='nucleotide', id=ntid, rettype='fasta', retmode='text' ).read()
louse_dna = parse( StringIO( fasta ), 'fasta' ).next()
print louse_dna.description
louse_records.append( louse_dna )
handle = Entrez.esearch(db='nucleotide', term='AF190409:AF190426[accn]' )
record = Entrez.read(handle)
for ntid in record['IdList'] :
fasta = Entrez.efetch( db='nucleotide', id=ntid, rettype='fasta', retmode='text' ).read()
louse_dna = parse( StringIO( fasta ), 'fasta' ).next()
print louse_dna.description
louse_records.append( louse_dna )
AF279704:AF279743
AF278608:AF278643
AY151003:AY151026
In [66]:
louse_gene_names = { '12S' : '12S ribosomal RNA',
'COI' : 'cytochrome oxidase subunit I',
'EF1' : 'elongation factor' }
louse_genes = {}
for record in louse_records :
tokens = record.description.split()
name = [ '_'.join(tokens[1:3]) ]
if tokens.__contains__( 'haplotype' ) :
i = tokens.index('haplotype')
haplotype = '_'.join( tokens[i:i+2] )
name.append( haplotype )
name = '_'.join( name )
if not name in louse_genes :
louse_genes[name] = {}
gene_name = None
for key in louse_gene_names.keys() :
if record.description.__contains__( louse_gene_names[key] ) :
louse_genes[name][key] = record.format('fasta')
louse_genetable = pandas.DataFrame( louse_genes ).T
louse_genetable.applymap( lambda x : len( str2seq(x) ) )
Out[66]:
In [67]:
# not sure why Erwinia herbicola is in there...
louse_genetable = louse_genetable.drop('Erwinia_herbicola')
In [76]:
for gene in louse_genetable.columns :
f = open( 'louse_' + gene + '.fasta', 'w' )
g = louse_genetable[gene].dropna()
for name, record in g.iteritems() :
record = parse( StringIO( record ), 'fasta' ).next()
record.id = name
record.description = ''
record.name = ''
f.write( record.format('fasta') )
f.close()
In [75]:
!clustalo -v --threads 4 -i louse_12S.fasta -o louse_12S_aln.fasta
!echo
!clustalo -v --threads 4 -i louse_COI.fasta -o louse_COI_aln.fasta
!echo
!clustalo -v --threads 4 -i louse_EF1.fasta -o louse_EF1_aln.fasta
!echo
In [77]:
from Bio import AlignIO
from Bio.Nexus import Nexus
from Bio import Alphabet
from glob import glob
nexi = [ (alnfile, Nexus.Nexus( AlignIO.read( alnfile, 'fasta', alphabet=Alphabet.generic_dna ).format('nexus') ) )
for alnfile in glob( 'louse*_aln.fasta' ) ]
combined = Nexus.combine( nexi )
f = open('louse.nex', 'w')
combined.write_nexus_data(f)
f.close()
# RAxML needs a phylip file
combined.export_phylip('louse.phylip')
Out[77]:
In [78]:
# RAxML partition file
f = open( 'louse.txt', 'w' )
for key in combined.charsets.keys() :
s = 'DNA, ' + key.split('_')[1] + ' = ' + str(combined.charsets[key][0]+1) + '-' + str(combined.charsets[key][-1]+1)
f.write( s + '\n' )
print s
f.close()
In [79]:
!raxmlHPC -m GTRGAMMA -q louse.txt -s louse.phylip -n louse -p 10001
In [80]:
from ete3 import Tree, TreeStyle, NodeStyle, TextFace
from numpy import linspace
ts = TreeStyle()
ts.mode = 'r'
ts.show_leaf_name = True
ts.branch_vertical_margin = 2
ts.scale = 1000
ts.show_leaf_name = False
ts.show_scale = False
nstyle = NodeStyle()
nstyle['size'] = 0
ete_tree = Tree( 'RAxML_bestTree.louse' )
#ete_tree.set_outgroup('Claravis_pretiosa')
for node in ete_tree.traverse() :
node.set_style(nstyle)
if node.is_leaf :
tf = TextFace( node.name.replace('_',' ').replace('\'','') )
tf.fsize = 10
tf.hz_align = 100
node.add_face( tf, 0 )
ete_tree.render("%%inline", w=120, units="mm", tree_style=ts)
Out[80]:
In [87]:
#map( lambda x : x.replace(' ','_'), bird_genetable.index )
list( louse_genetable.index )
Out[87]:
In [ ]:
{'Claravis_pretiosa' : ['Columbicola_passerinae_haplotype_2'],
'Columba_guinea' : ['Columbicola_columbae_haplotype_2'],
'Columba_livia' : ['Columbicola_columbae_haplotype_1'],
'Columba_plumbea' : ['Columbicola_adamsi'],
'Columba_speciosa' : ['Columbicola_adamsi'],
'Columba_subvinacea' : [],
'Columbina_inca' : ['Columbicola_passerinae_haplotype_1'],
'Columbina_passerina' : ['Columbicola_passerinae_haplotype_1'],
'Geotrygon_montana' : [],
'Leptotila_jamaicensis' : [],
'Leptotila_plumbeiceps' : [],
'Leptotila_rufaxilla' : [],
'Leptotila_verreauxi' : [],
'Metriopelia_ceciliae' : [],
'Oena_capensis' : [],
'Patagioenas_fasciata' : [],
'Phapitreron_amethystina' : [],
'Phapitreron_leucotis' : [],
'Ptilinopus_occipitalis' : [],
'Streptopelia_capicola' : [],
'Streptopelia_decaocto' : [],
'Streptopelia_senegalensis' : [],
'Zenaida_asiatica' : [],
'Zenaida_galapagoensis' : [],
'Zenaida_macroura' : [] }