In [1]:

    
%pylab inline
%config InlineBackend.figure_format = 'retina'









    



Populating the interactive namespace from numpy and matplotlib

Rebuilding the Dove-Louse co-phylogeny

This notebook rebuilds the co-phylogeny found in Host defense reinforces host-parasite cospeciation, by Dale H. Clayton, Sarah E. Bush, Brad M. Goates, and Kevin P. Johnson.

Clayton, Dale H., et al. "Host defense reinforces host–parasite cospeciation." Proceedings of the National Academy of Sciences 100.26 (2003): 15694-15699.

The original co-phylogeny appears in the paper as follows :

Required software

Python packages (available through pip) :

biopython : talking to NCBI, manipulating file formats
pandas : keeping our data organized
ete3 : visualization for phylogenetic trees

System packages :

clustalo : multiple sequence alignments
RAxML : phylogenetic tree inference

Bird tree

The original study used cytochrome oxidase subunit I (COI), cytochrome b (CYTB) and beta fibrinogen (FGB). The deposited FGB sequences for some of the doves look weird in the alignment and result in a tree with nonsensically long branches for those organisms. So, I removed FGB and added ND2, IRBP, MB, NTF3 and 12S. This mostly recapitulates the published phylogeny using open source software (RAxML) and GTR+GAMMA instead of GTR+I+GAMMA



In [2]:

    
from Bio import Entrez
Entrez.email = "ryneches@ucdavis.edu"

terms = { 'COI'  : [ 'COI[gene] 350:450[slen]', 'cytochrome oxidase subunit I 350:450[slen]' ],
          'CYTB' : [ 'cytb[gene] 800:1200[slen]', 'cytochrome b 800:1200[slen]' ],
          'ND2'  : [ 'ND2[gene] 980:1100[slen]', 'NADH dehydrogenase subunit 2 980:1100[slen]' ],
          'IRBP' : [ 'IRBP[gene] 600:1000[slen]' ],
          'RAG1' : [ 'RAG1[gene] 2000:3000[slen]', 'RAG-1[gene] 2000:3000[slen]' ],
#          'FGB'  : [ 'FGB[gene] 800:1300[slen]', 'beta fibrinogen 800:1300[slen]', 
#                     'fibrinogen beta 800:1300[slen]', 'beta-fibrinogen 450:1300[slen]' ],
          'MB'   : [ 'MB[gene] 700:900[slen]', 'myoglobin 700:900[slen]' ],
          'NGF'  : [ 'NGF[gene] 600:1000[slen]', 'nerve growth factor beta polypeptide 600:1000[slen]' ],
          'NTF3' : [ 'NTF3[gene] 600:1000[slen]', 'neurotrophin 3 600:1000[slen]' ],
          '12S'  : [ '12S ribosomal 300:500[slen]' ]
        }

def find_txid( species ) :
    handle = Entrez.esearch(db='taxonomy', term=species)
    record = Entrez.read(handle)
    if int(record['Count']) == 0 :
        raise Exception( 'no records found' )
    txid = record['IdList'][0]
    return txid

def find_gene( txid, gene ) :
    taxid_query = 'txid' + str(txid) + '[Orgn] '
    query = ' or '.join( [ taxid_query + s for s in terms[gene] ] )
    #print query
    handle = Entrez.esearch(db='nucleotide', term=query )
    record = Entrez.read(handle)
    if int(record['Count']) == 0 :
        raise Exception( 'no genes found' )
    
    ntid = record['IdList'][0]
    fasta = Entrez.efetch( db='nucleotide', id=ntid, rettype='fasta', retmode='text' )
    return fasta.read()



In [3]:

    
birdnames = """Columba guinea
Columba speciosa
Metriopelia ceciliae
Leptotila jamaicensis
Leptotila plumbeiceps
Leptotila verreauxi
Leptotila rufaxilla
Phapitreron leucotis
Ptilinopus occipitalis
Streptopelia capicola
Streptopelia senegalensis
Streptopelia decaocto
Patagioenas fasciata
Columba subvinacea
Claravis pretiosa
Columba livia
Columba plumbea
Columbina inca
Columbina passerina
Geotrygon montana
Phapitreron amethystina
Zenaida asiatica
Zenaida galapagoensis
Zenaida macroura
Oena capensis"""

birdnames = sorted(birdnames.split('\n'))



In [4]:

    
birdids = []
for name in birdnames :
    bid = find_txid( name )
    birdids.append(bid)
    print name, bid
bird_nametoid = zip( birdnames, birdids )









    



Claravis pretiosa 115590
Columba guinea 135627
Columba livia 8932
Columba plumbea 372349
Columba speciosa 372313
Columba subvinacea 115599
Columbina inca 115616
Columbina passerina 111974
Geotrygon montana 115651
Leptotila jamaicensis 135629
Leptotila plumbeiceps 135630
Leptotila rufaxilla 115657
Leptotila verreauxi 135631
Metriopelia ceciliae 115677
Oena capensis 115679
Patagioenas fasciata 372321
Phapitreron amethystina 444154
Phapitreron leucotis 135632
Ptilinopus occipitalis 135633
Streptopelia capicola 135634
Streptopelia decaocto 177147
Streptopelia senegalensis 135635
Zenaida asiatica 93382
Zenaida galapagoensis 115708
Zenaida macroura 47245



In [5]:

    
bird_genes = {}
for species, txid in bird_nametoid :
    bird_genes[species] = {}
    for genename in terms.keys() :
        try :
            bird_genes[species][genename] = find_gene( txid, genename )
        except :
            #bird_genes[species][genename] = ''
            continue
    print species, ':', ' '.join( sorted(bird_genes[species].keys()) )









    



Claravis pretiosa : COI CYTB IRBP ND2 RAG1
Columba guinea : COI CYTB ND2
Columba livia : 12S COI CYTB IRBP MB ND2 NGF NTF3 RAG1
Columba plumbea : COI CYTB ND2
Columba speciosa : COI CYTB IRBP ND2 RAG1
Columba subvinacea : COI CYTB ND2
Columbina inca : COI CYTB ND2
Columbina passerina : COI CYTB MB ND2 NGF NTF3
Geotrygon montana : COI CYTB IRBP MB ND2 NGF NTF3 RAG1
Leptotila jamaicensis : COI CYTB ND2
Leptotila plumbeiceps : COI CYTB ND2
Leptotila rufaxilla : COI CYTB IRBP ND2 RAG1
Leptotila verreauxi : COI CYTB ND2
Metriopelia ceciliae : COI CYTB ND2
Oena capensis : 12S CYTB IRBP ND2 RAG1
Patagioenas fasciata : COI CYTB ND2
Phapitreron amethystina : COI CYTB IRBP ND2 RAG1
Phapitreron leucotis : COI CYTB ND2
Ptilinopus occipitalis : 12S COI CYTB ND2
Streptopelia capicola : COI CYTB IRBP ND2 RAG1
Streptopelia decaocto : COI CYTB ND2
Streptopelia senegalensis : COI CYTB ND2
Zenaida asiatica : COI CYTB ND2
Zenaida galapagoensis : COI CYTB ND2
Zenaida macroura : COI CYTB IRBP ND2 RAG1



In [6]:

    
import pandas
from Bio.SeqIO import parse
from Bio.SeqRecord import SeqRecord
from StringIO import StringIO

def str2seq( s ) :
    if type( s ) == float :
        return SeqRecord( '' )
    return parse( StringIO( s ), 'fasta' ).next()

bird_genetable = pandas.DataFrame( bird_genes ).T
#bird_genetable.applymap( lambda x : str2seq( x ).description ).T
bird_genetable.applymap( lambda x : len(str2seq( x )) )









    Out[6]:






  
    
      
      12S
      COI
      CYTB
      IRBP
      MB
      ND2
      NGF
      NTF3
      RAG1
    
  
  
    
      Claravis pretiosa
      0
      383
      1045
      804
      0
      1041
      0
      0
      2673
    
    
      Columba guinea
      0
      383
      972
      0
      0
      1066
      0
      0
      0
    
    
      Columba livia
      446
      383
      1045
      804
      717
      1067
      802
      951
      2865
    
    
      Columba plumbea
      0
      383
      1045
      0
      0
      1092
      0
      0
      0
    
    
      Columba speciosa
      0
      383
      1045
      804
      0
      1066
      0
      0
      2733
    
    
      Columba subvinacea
      0
      379
      1045
      0
      0
      1066
      0
      0
      0
    
    
      Columbina inca
      0
      383
      1039
      0
      0
      1091
      0
      0
      0
    
    
      Columbina passerina
      0
      383
      1045
      0
      730
      1089
      736
      728
      0
    
    
      Geotrygon montana
      0
      383
      1010
      804
      740
      1066
      711
      711
      2658
    
    
      Leptotila jamaicensis
      0
      383
      1017
      0
      0
      1041
      0
      0
      0
    
    
      Leptotila plumbeiceps
      0
      383
      1045
      0
      0
      1041
      0
      0
      0
    
    
      Leptotila rufaxilla
      0
      379
      1045
      804
      0
      1041
      0
      0
      2655
    
    
      Leptotila verreauxi
      0
      383
      1045
      0
      0
      1066
      0
      0
      0
    
    
      Metriopelia ceciliae
      0
      383
      1068
      0
      0
      1092
      0
      0
      0
    
    
      Oena capensis
      389
      0
      1044
      804
      0
      1034
      0
      0
      2694
    
    
      Patagioenas fasciata
      0
      379
      1069
      0
      0
      1066
      0
      0
      0
    
    
      Phapitreron amethystina
      0
      383
      1010
      804
      0
      1031
      0
      0
      2733
    
    
      Phapitreron leucotis
      0
      383
      1045
      0
      0
      1029
      0
      0
      0
    
    
      Ptilinopus occipitalis
      400
      376
      1052
      0
      0
      1015
      0
      0
      0
    
    
      Streptopelia capicola
      0
      383
      1045
      804
      0
      1066
      0
      0
      2691
    
    
      Streptopelia decaocto
      0
      379
      1068
      0
      0
      1066
      0
      0
      0
    
    
      Streptopelia senegalensis
      0
      383
      1045
      0
      0
      1065
      0
      0
      0
    
    
      Zenaida asiatica
      0
      383
      1034
      0
      0
      1092
      0
      0
      0
    
    
      Zenaida galapagoensis
      0
      377
      1045
      0
      0
      1082
      0
      0
      0
    
    
      Zenaida macroura
      0
      382
      1045
      804
      0
      1092
      0
      0
      2733



In [7]:

    
def rename_fasta( s ) :
    if type( s ) == float :
        return SeqRecord( '' )
    fasta = parse( StringIO( s ), 'fasta' ).next()
    description = fasta.description.replace('PREDICTED:', '')
    fasta.id = '_'.join( description.split()[1:3] )
    fasta.description = ''
    return fasta.format('fasta')

g = bird_genetable['NGF'].dropna()
g.apply( rename_fasta )









    Out[7]:





Columba livia          >Columba_livia\nAAGCAGGACCCGGCTCAGCTGGCGCAGAAC...
Columbina passerina    >Columbina_passerina\nCTGTACTACACTCTGACTGTAGCT...
Geotrygon montana      >Geotrygon_montana\nTTGATCGGCACACAGGCAGCTCCAAA...
Name: NGF, dtype: object



In [8]:

    
for gene in bird_genetable.columns :
    f = open( 'bird_' + gene + '.fasta', 'w' )
    g = bird_genetable[gene].dropna()
    g = g.apply( rename_fasta )
    for record in g :
        f.write( record )
    f.close()



In [9]:

    
!clustalo -v --threads 4 -i bird_12S.fasta        -o bird_12S_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_COI.fasta        -o bird_COI_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_CYTB.fasta       -o bird_CYTB_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_FGB.fasta        -o bird_FGB_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_IRBP.fasta       -o bird_IRBP_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_MB.fasta         -o bird_MB_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_ND2.fasta        -o bird_ND2_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_NGF.fasta        -o bird_NGF_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_NTF3.fasta       -o bird_NTF3_aln.fasta
!echo
!clustalo -v --threads 4 -i bird_RAG1.fasta       -o bird_RAG1_aln.fasta









    



Using 4 threads
Read 3 sequences (type: DNA) from bird_12S.fasta
not more sequences (3) than cluster-size (100), turn off mBed
Calculating pairwise ktuple-distances...
Ktuple-distance calculation progress done. CPU time: 0.02u 0.00s 00:00:00.02 Elapsed: 00:00:00
Guide-tree computation done.
Progressive alignment progress done. CPU time: 0.20u 0.00s 00:00:00.20 Elapsed: 00:00:00
Alignment written to bird_12S_aln.fasta

Using 4 threads
Read 24 sequences (type: DNA) from bird_COI.fasta
not more sequences (24) than cluster-size (100), turn off mBed
Calculating pairwise ktuple-distances...
Ktuple-distance calculation progress done. CPU time: 0.21u 0.00s 00:00:00.21 Elapsed: 00:00:00
Guide-tree computation done.
Progressive alignment progress done. CPU time: 1.14u 0.04s 00:00:01.18 Elapsed: 00:00:00
Alignment written to bird_COI_aln.fasta

Using 4 threads
Read 25 sequences (type: DNA) from bird_CYTB.fasta
not more sequences (25) than cluster-size (100), turn off mBed
Calculating pairwise ktuple-distances...
Ktuple-distance calculation progress done. CPU time: 0.43u 0.00s 00:00:00.43 Elapsed: 00:00:00
Guide-tree computation done.
Progressive alignment progress done. CPU time: 5.88u 0.41s 00:00:06.29 Elapsed: 00:00:05
Alignment written to bird_CYTB_aln.fasta

Using 4 threads
ERROR: Failed to open sequence file bird_FGB.fasta for reading
FATAL: Reading sequence file 'bird_FGB.fasta' failed

Using 4 threads
Read 9 sequences (type: DNA) from bird_IRBP.fasta
not more sequences (9) than cluster-size (100), turn off mBed
Calculating pairwise ktuple-distances...
Ktuple-distance calculation progress done. CPU time: 0.09u 0.00s 00:00:00.09 Elapsed: 00:00:00
Guide-tree computation done.
Progressive alignment progress done. CPU time: 1.12u 0.06s 00:00:01.18 Elapsed: 00:00:01
Alignment written to bird_IRBP_aln.fasta

Using 4 threads
Read 3 sequences (type: Protein) from bird_MB.fasta
not more sequences (3) than cluster-size (100), turn off mBed
Calculating pairwise ktuple-distances...
Ktuple-distance calculation progress done. CPU time: 0.02u 0.00s 00:00:00.02 Elapsed: 00:00:00
Guide-tree computation done.
Progressive alignment progress done. CPU time: 0.32u 0.02s 00:00:00.34 Elapsed: 00:00:01
Alignment written to bird_MB_aln.fasta

Using 4 threads
Read 25 sequences (type: DNA) from bird_ND2.fasta
not more sequences (25) than cluster-size (100), turn off mBed
Calculating pairwise ktuple-distances...
Ktuple-distance calculation progress done. CPU time: 0.33u 0.00s 00:00:00.33 Elapsed: 00:00:00
Guide-tree computation done.
Progressive alignment progress done. CPU time: 6.81u 0.47s 00:00:07.27 Elapsed: 00:00:06
Alignment written to bird_ND2_aln.fasta

Using 4 threads
Read 3 sequences (type: DNA) from bird_NGF.fasta
not more sequences (3) than cluster-size (100), turn off mBed
Calculating pairwise ktuple-distances...
Ktuple-distance calculation progress done. CPU time: 0.02u 0.00s 00:00:00.02 Elapsed: 00:00:00
Guide-tree computation done.
Progressive alignment progress done. CPU time: 0.36u 0.02s 00:00:00.38 Elapsed: 00:00:01
Alignment written to bird_NGF_aln.fasta

Using 4 threads
Read 3 sequences (type: DNA) from bird_NTF3.fasta
not more sequences (3) than cluster-size (100), turn off mBed
Calculating pairwise ktuple-distances...
Ktuple-distance calculation progress done. CPU time: 0.02u 0.00s 00:00:00.02 Elapsed: 00:00:00
Guide-tree computation done.
Progressive alignment progress done. CPU time: 0.35u 0.02s 00:00:00.37 Elapsed: 00:00:00
Alignment written to bird_NTF3_aln.fasta

Using 4 threads
Read 9 sequences (type: DNA) from bird_RAG1.fasta
not more sequences (9) than cluster-size (100), turn off mBed
Calculating pairwise ktuple-distances...
Ktuple-distance calculation progress done. CPU time: 0.22u 0.00s 00:00:00.22 Elapsed: 00:00:00
Guide-tree computation done.
Progressive alignment progress done. CPU time: 12.85u 1.04s 00:00:13.89 Elapsed: 00:00:14
Alignment written to bird_RAG1_aln.fasta



In [10]:

    
from Bio import AlignIO
from Bio.Nexus import Nexus
from Bio import Alphabet
from glob import glob

nexi = [ (alnfile, Nexus.Nexus( AlignIO.read( alnfile, 'fasta', alphabet=Alphabet.generic_dna  ).format('nexus') ) ) 
          for alnfile in glob( 'bird*_aln.fasta' ) ]
combined = Nexus.combine( nexi )
f = open('bird.nex', 'w')
combined.write_nexus_data(f)
f.close()

# RAxML needs a phylip file
combined.export_phylip('bird.phylip')









    Out[10]:





'bird.phylip'



In [11]:

    
# RAxML partition file
f = open( 'bird.txt', 'w' )
for key in combined.charsets.keys() :
    s = 'DNA, ' + key.split('_')[1] + ' = ' + str(combined.charsets[key][0]+1) + '-' + str(combined.charsets[key][-1]+1)
    f.write( s + '\n' )
    print s
f.close()









    



DNA, CYTB = 8181-9257
DNA, NTF3 = 4058-5008
DNA, ND2 = 7088-8180
DNA, MB = 1-740
DNA, 12S = 3606-4057
DNA, COI = 5009-5403
DNA, IRBP = 6284-7087
DNA, RAG1 = 741-3605
DNA, NGF = 5404-6283



In [12]:

    
#!raxmlHPC -m MULTICAT -q bird.txt -s bird.phylip -n bird -p 10001
!raxmlHPC -m GTRGAMMA -q bird.txt -s bird.phylip -n bird -p 10001









    



Use raxml with AVX support (2 cpus)

This is the RAxML Master Pthread

This is RAxML Worker Pthread Number: 1


This is RAxML version 8.2.9 released by Alexandros Stamatakis on July 20 2016.

With greatly appreciated code contributions by:
Andre Aberer      (HITS)
Simon Berger      (HITS)
Alexey Kozlov     (HITS)
Kassian Kobert    (HITS)
David Dao         (KIT and HITS)
Sarah Lutteropp   (KIT and HITS)
Nick Pattengale   (Sandia)
Wayne Pfeiffer    (SDSC)
Akifumi S. Tanabe (NRIFS)
Charlie Taylor    (UF)


Alignment has 1588 distinct alignment patterns

Proportion of gaps and completely undetermined characters in this alignment: 59.52%

RAxML rapid hill-climbing mode

Using 9 distinct models/data partitions with joint branch length optimization


Executing 1 inferences on the original alignment using 1 distinct randomized MP trees

All free model parameters will be estimated by RAxML
GAMMA model of rate heteorgeneity, ML estimate of alpha-parameter

GAMMA Model parameters will be estimated up to an accuracy of 0.1000000000 Log Likelihood units

Partition: 0
Alignment Patterns: 450
Name: CYTB
DataType: DNA
Substitution Matrix: GTR



Partition: 1
Alignment Patterns: 28
Name: NTF3
DataType: DNA
Substitution Matrix: GTR



Partition: 2
Alignment Patterns: 571
Name: ND2
DataType: DNA
Substitution Matrix: GTR



Partition: 3
Alignment Patterns: 38
Name: MB
DataType: DNA
Substitution Matrix: GTR



Partition: 4
Alignment Patterns: 47
Name: 12S
DataType: DNA
Substitution Matrix: GTR



Partition: 5
Alignment Patterns: 179
Name: COI
DataType: DNA
Substitution Matrix: GTR



Partition: 6
Alignment Patterns: 63
Name: IRBP
DataType: DNA
Substitution Matrix: GTR



Partition: 7
Alignment Patterns: 172
Name: RAG1
DataType: DNA
Substitution Matrix: GTR



Partition: 8
Alignment Patterns: 40
Name: NGF
DataType: DNA
Substitution Matrix: GTR




RAxML was called as follows:

/usr/bin/raxmlHPC-PTHREADS-AVX -T 2 -m GTRGAMMA -q bird.txt -s bird.phylip -n bird -p 10001 


Partition: 0 with name: CYTB
Base frequencies: 0.266 0.351 0.129 0.254 

Partition: 1 with name: NTF3
Base frequencies: 0.298 0.230 0.232 0.241 

Partition: 2 with name: ND2
Base frequencies: 0.322 0.339 0.102 0.237 

Partition: 3 with name: MB
Base frequencies: 0.264 0.240 0.236 0.260 

Partition: 4 with name: 12S
Base frequencies: 0.286 0.277 0.223 0.214 

Partition: 5 with name: COI
Base frequencies: 0.267 0.276 0.188 0.269 

Partition: 6 with name: IRBP
Base frequencies: 0.279 0.215 0.276 0.230 

Partition: 7 with name: RAG1
Base frequencies: 0.314 0.202 0.238 0.246 

Partition: 8 with name: NGF
Base frequencies: 0.233 0.290 0.279 0.198 

Inference[0]: Time 7.960409 GAMMA-based likelihood -34611.038557, best rearrangement setting 5


Conducting final model optimizations on all 1 trees under GAMMA-based models ....

Inference[0] final GAMMA-based Likelihood: -34602.555376 tree written to file /home/russell/Projects/SuchTree/data/dove-louse/RAxML_result.bird


Starting final GAMMA-based thorough Optimization on tree 0 likelihood -34602.555376 .... 

Final GAMMA-based Score of best tree -34602.555376

Program execution info written to /home/russell/Projects/SuchTree/data/dove-louse/RAxML_info.bird
Best-scoring ML tree written to: /home/russell/Projects/SuchTree/data/dove-louse/RAxML_bestTree.bird

Overall execution time: 12.941683 secs or 0.003595 hours or 0.000150 days



In [13]:

    
from ete3 import Tree, TreeStyle, NodeStyle, TextFace
from numpy import linspace

ts = TreeStyle()
ts.mode = 'r'
ts.show_leaf_name = True
ts.branch_vertical_margin = 2
ts.scale = 1000
ts.show_leaf_name = False
ts.show_scale = False

nstyle = NodeStyle()
nstyle['size'] = 0 

ete_tree = Tree( 'RAxML_bestTree.bird' )

ete_tree.set_outgroup('Claravis_pretiosa')

for node in ete_tree.traverse() :
    node.set_style(nstyle)
    if node.is_leaf :
        tf = TextFace( node.name.replace('_',' ').replace('\'','') )
        tf.fsize = 10
        tf.hz_align = 100
        node.add_face( tf, 0 )

ete_tree.render("%%inline", w=120, units="mm", tree_style=ts)









    Out[13]:

Louse tree



In [25]:

    
louse_records = []

handle = Entrez.esearch(db='nucleotide', term='AF278608:AF278643[accn]' )
record = Entrez.read(handle)
for ntid in record['IdList'] :
    fasta = Entrez.efetch( db='nucleotide', id=ntid, rettype='fasta', retmode='text' ).read()
    louse_dna = parse( StringIO( fasta ), 'fasta' ).next()
    print louse_dna.description
    louse_records.append( louse_dna )
    
handle = Entrez.esearch(db='nucleotide', term='AF190409:AF190426[accn]' )
record = Entrez.read(handle)
for ntid in record['IdList'] :
    fasta = Entrez.efetch( db='nucleotide', id=ntid, rettype='fasta', retmode='text' ).read()
    louse_dna = parse( StringIO( fasta ), 'fasta' ).next()
    print louse_dna.description
    louse_records.append( louse_dna )

AF279704:AF279743
AF278608:AF278643
AY151003:AY151026









    



AF278643.1 Columbicola baculoides elongation factor 1 alpha gene, partial cds
AF278642.1 Columbicola passerinae haplotype 2 elongation factor 1 alpha gene, partial cds
AF278641.1 Columbicola passerinae haplotype 1 elongation factor 1 alpha gene, partial cds
AF278640.1 Columbicola theresae elongation factor 1 alpha gene, partial cds
AF278639.1 Columbicola gracilicapitis elongation factor 1 alpha gene, partial cds
AF278638.1 Columbicola xavieri elongation factor 1 alpha gene, partial cds
AF278637.1 Columbicola deboomi elongation factor 1 alpha gene, partial cds
AF278636.1 Columbicola columbae haplotype 2 elongation factor 1 alpha gene, partial cds
AF278635.1 Columbicola columbae haplotype 1 elongation factor 1 alpha gene, partial cds
AF278634.1 Columbicola sp. KPJ-2000 elongation factor 1 alpha gene, partial cds
AF278633.1 Columbicola macrourae haplotype 3 elongation factor 1 alpha gene, partial cds
AF278632.1 Columbicola macrourae haplotype 4 elongation factor 1 alpha gene, partial cds
AF278631.1 Columbicola macrourae haplotype 2 elongation factor 1 alpha gene, partial cds
AF278630.1 Columbicola macrourae haplotype 1 elongation factor 1 alpha gene, partial cds
AF278629.1 Columbicola adamsi elongation factor 1 alpha gene, partial cds
AF278628.1 Columbicola baculoides cytochrome oxidase subunit I gene, partial cds; mitochondrial gene for mitochondrial product
AF278627.1 Columbicola passerinae haplotype 2 cytochrome oxidase subunit I gene, partial cds; mitochondrial gene for mitochondrial product
AF278626.1 Columbicola passerinae haplotype 1 cytochrome oxidase subunit I gene, partial cds; mitochondrial gene for mitochondrial product
AF278625.1 Columbicola theresae cytochrome oxidase subunit I gene, partial cds; mitochondrial gene for mitochondrial product
AF278624.1 Columbicola gracilicapitis cytochrome oxidase subunit I gene, partial cds; mitochondrial gene for mitochondrial product
AF190415.1 Columbicola columbae haplotype 1 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product
AF190426.1 Erwinia herbicola aspartate transcarbamylase gene cluster pyrLBI, complete sequence
AF190425.1 Columbicola baculoides 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product
AF190424.1 Columbicola gracilicapitis 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product
AF190423.1 Columbicola macrourae haplotype 3 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product
AF190422.1 Columbicola macrourae haplotype 1 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product
AF190421.1 Columbicola macrourae haplotype 1 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product
AF190420.1 Columbicola macrourae haplotype 1 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product
AF190419.1 Columbicola adamsi 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product
AF190418.1 Columbicola adamsi 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product
AF190417.1 Columbicola macrourae haplotype 2 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product
AF190416.1 Columbicola macrourae haplotype 2 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product
AF190414.1 Columbicola columbae haplotype 1 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product
AF190413.1 Columbicola columbae haplotype 1 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product
AF190412.1 Columbicola columbae haplotype 1 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product
AF190411.1 Columbicola passerinae haplotype 2 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product
AF190410.1 Columbicola passerinae haplotype 1 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product
AF190409.1 Columbicola passerinae haplotype 1 12S ribosomal RNA gene, partial sequence; mitochondrial gene for mitochondrial product



In [66]:

    
louse_gene_names = { '12S' : '12S ribosomal RNA', 
                     'COI' : 'cytochrome oxidase subunit I',
                     'EF1'  : 'elongation factor' }

louse_genes = {}

for record in louse_records :
    tokens = record.description.split()
    name = [ '_'.join(tokens[1:3]) ]
    if tokens.__contains__( 'haplotype' ) :
        i = tokens.index('haplotype')
        haplotype = '_'.join( tokens[i:i+2] )
        name.append( haplotype )
    name = '_'.join( name )
    
    if not name in louse_genes :
        louse_genes[name] = {}
    
    gene_name = None
    for key in louse_gene_names.keys() :
        if record.description.__contains__( louse_gene_names[key] ) :
            louse_genes[name][key] = record.format('fasta')


louse_genetable = pandas.DataFrame( louse_genes ).T
louse_genetable.applymap( lambda x : len( str2seq(x) ) )









    Out[66]:






  
    
      
      12S
      COI
      EF1
    
  
  
    
      Columbicola_adamsi
      337
      0
      328
    
    
      Columbicola_baculoides
      348
      386
      350
    
    
      Columbicola_columbae_haplotype_1
      335
      0
      350
    
    
      Columbicola_columbae_haplotype_2
      0
      0
      350
    
    
      Columbicola_deboomi
      0
      0
      350
    
    
      Columbicola_gracilicapitis
      343
      383
      350
    
    
      Columbicola_macrourae_haplotype_1
      349
      0
      350
    
    
      Columbicola_macrourae_haplotype_2
      339
      0
      317
    
    
      Columbicola_macrourae_haplotype_3
      339
      0
      350
    
    
      Columbicola_macrourae_haplotype_4
      0
      0
      350
    
    
      Columbicola_passerinae_haplotype_1
      337
      383
      317
    
    
      Columbicola_passerinae_haplotype_2
      345
      383
      317
    
    
      Columbicola_sp.
      0
      0
      340
    
    
      Columbicola_theresae
      0
      383
      350
    
    
      Columbicola_xavieri
      0
      0
      350
    
    
      Erwinia_herbicola
      0
      0
      0



In [67]:

    
# not sure why Erwinia herbicola is in there...
louse_genetable = louse_genetable.drop('Erwinia_herbicola')



In [76]:

    
for gene in louse_genetable.columns :
    f = open( 'louse_' + gene + '.fasta', 'w' )
    g = louse_genetable[gene].dropna()
    for name, record in g.iteritems() :
        record = parse( StringIO( record ), 'fasta' ).next()
        record.id = name
        record.description = ''
        record.name = ''
        f.write( record.format('fasta') )
    f.close()



In [75]:

    
!clustalo -v --threads 4 -i louse_12S.fasta        -o louse_12S_aln.fasta
!echo
!clustalo -v --threads 4 -i louse_COI.fasta        -o louse_COI_aln.fasta
!echo
!clustalo -v --threads 4 -i louse_EF1.fasta        -o louse_EF1_aln.fasta
!echo









    



Using 4 threads
Read 9 sequences (type: DNA) from louse_12S.fasta
not more sequences (9) than cluster-size (100), turn off mBed
Calculating pairwise ktuple-distances...
Ktuple-distance calculation progress done. CPU time: 0.03u 0.00s 00:00:00.03 Elapsed: 00:00:00
Guide-tree computation done.
Progressive alignment progress done. CPU time: 0.64u 0.01s 00:00:00.65 Elapsed: 00:00:00
Alignment written to louse_12S_aln.fasta

Using 4 threads
Read 5 sequences (type: DNA) from louse_COI.fasta
not more sequences (5) than cluster-size (100), turn off mBed
Calculating pairwise ktuple-distances...
Ktuple-distance calculation progress done. CPU time: 0.02u 0.00s 00:00:00.02 Elapsed: 00:00:00
Guide-tree computation done.
Progressive alignment progress done. CPU time: 0.43u 0.00s 00:00:00.43 Elapsed: 00:00:00
Alignment written to louse_COI_aln.fasta

Using 4 threads
Read 15 sequences (type: DNA) from louse_EF1.fasta
not more sequences (15) than cluster-size (100), turn off mBed
Calculating pairwise ktuple-distances...
Ktuple-distance calculation progress done. CPU time: 0.08u 0.00s 00:00:00.08 Elapsed: 00:00:00
Guide-tree computation done.
Progressive alignment progress done. CPU time: 0.63u 0.02s 00:00:00.65 Elapsed: 00:00:00
Alignment written to louse_EF1_aln.fasta



In [77]:

    
from Bio import AlignIO
from Bio.Nexus import Nexus
from Bio import Alphabet
from glob import glob

nexi = [ (alnfile, Nexus.Nexus( AlignIO.read( alnfile, 'fasta', alphabet=Alphabet.generic_dna  ).format('nexus') ) ) 
          for alnfile in glob( 'louse*_aln.fasta' ) ]
combined = Nexus.combine( nexi )
f = open('louse.nex', 'w')
combined.write_nexus_data(f)
f.close()

# RAxML needs a phylip file
combined.export_phylip('louse.phylip')









    Out[77]:





'louse.phylip'



In [78]:

    
# RAxML partition file
f = open( 'louse.txt', 'w' )
for key in combined.charsets.keys() :
    s = 'DNA, ' + key.split('_')[1] + ' = ' + str(combined.charsets[key][0]+1) + '-' + str(combined.charsets[key][-1]+1)
    f.write( s + '\n' )
    print s
f.close()









    



DNA, 12S = 1-361
DNA, EF1 = 748-1124
DNA, COI = 362-747



In [79]:

    
!raxmlHPC -m GTRGAMMA -q louse.txt -s louse.phylip -n louse -p 10001









    



Use raxml with AVX support (2 cpus)

This is the RAxML Master Pthread

This is RAxML Worker Pthread Number: 1


This is RAxML version 8.2.9 released by Alexandros Stamatakis on July 20 2016.

With greatly appreciated code contributions by:
Andre Aberer      (HITS)
Simon Berger      (HITS)
Alexey Kozlov     (HITS)
Kassian Kobert    (HITS)
David Dao         (KIT and HITS)
Sarah Lutteropp   (KIT and HITS)
Nick Pattengale   (Sandia)
Wayne Pfeiffer    (SDSC)
Akifumi S. Tanabe (NRIFS)
Charlie Taylor    (UF)


Alignment has 401 distinct alignment patterns

Proportion of gaps and completely undetermined characters in this alignment: 40.17%

RAxML rapid hill-climbing mode

Using 3 distinct models/data partitions with joint branch length optimization


Executing 1 inferences on the original alignment using 1 distinct randomized MP trees

All free model parameters will be estimated by RAxML
GAMMA model of rate heteorgeneity, ML estimate of alpha-parameter

GAMMA Model parameters will be estimated up to an accuracy of 0.1000000000 Log Likelihood units

Partition: 0
Alignment Patterns: 192
Name: 12S
DataType: DNA
Substitution Matrix: GTR



Partition: 1
Alignment Patterns: 114
Name: EF1
DataType: DNA
Substitution Matrix: GTR



Partition: 2
Alignment Patterns: 95
Name: COI
DataType: DNA
Substitution Matrix: GTR




RAxML was called as follows:

/usr/bin/raxmlHPC-PTHREADS-AVX -T 2 -m GTRGAMMA -q louse.txt -s louse.phylip -n louse -p 10001 


Partition: 0 with name: 12S
Base frequencies: 0.364 0.149 0.188 0.299 

Partition: 1 with name: EF1
Base frequencies: 0.266 0.247 0.240 0.248 

Partition: 2 with name: COI
Base frequencies: 0.267 0.141 0.223 0.368 

Inference[0]: Time 0.756426 GAMMA-based likelihood -5374.702861, best rearrangement setting 5


Conducting final model optimizations on all 1 trees under GAMMA-based models ....

Inference[0] final GAMMA-based Likelihood: -5373.769314 tree written to file /home/russell/Projects/SuchTree/data/dove-louse/RAxML_result.louse


Starting final GAMMA-based thorough Optimization on tree 0 likelihood -5373.769314 .... 

Final GAMMA-based Score of best tree -5373.769209

Program execution info written to /home/russell/Projects/SuchTree/data/dove-louse/RAxML_info.louse
Best-scoring ML tree written to: /home/russell/Projects/SuchTree/data/dove-louse/RAxML_bestTree.louse

Overall execution time: 1.108562 secs or 0.000308 hours or 0.000013 days



In [80]:

    
from ete3 import Tree, TreeStyle, NodeStyle, TextFace
from numpy import linspace

ts = TreeStyle()
ts.mode = 'r'
ts.show_leaf_name = True
ts.branch_vertical_margin = 2
ts.scale = 1000
ts.show_leaf_name = False
ts.show_scale = False

nstyle = NodeStyle()
nstyle['size'] = 0

ete_tree = Tree( 'RAxML_bestTree.louse' )

#ete_tree.set_outgroup('Claravis_pretiosa')

for node in ete_tree.traverse() :
    node.set_style(nstyle)
    if node.is_leaf :
        tf = TextFace( node.name.replace('_',' ').replace('\'','') )
        tf.fsize = 10
        tf.hz_align = 100
        node.add_face( tf, 0 )

ete_tree.render("%%inline", w=120, units="mm", tree_style=ts)









    Out[80]:



In [87]:

    
#map( lambda x : x.replace(' ','_'), bird_genetable.index )
list( louse_genetable.index )









    Out[87]:





['Columbicola_adamsi',
 'Columbicola_baculoides',
 'Columbicola_columbae_haplotype_1',
 'Columbicola_columbae_haplotype_2',
 'Columbicola_deboomi',
 'Columbicola_gracilicapitis',
 'Columbicola_macrourae_haplotype_1',
 'Columbicola_macrourae_haplotype_2',
 'Columbicola_macrourae_haplotype_3',
 'Columbicola_macrourae_haplotype_4',
 'Columbicola_passerinae_haplotype_1',
 'Columbicola_passerinae_haplotype_2',
 'Columbicola_sp.',
 'Columbicola_theresae',
 'Columbicola_xavieri']



In [ ]:

    
{'Claravis_pretiosa'         : ['Columbicola_passerinae_haplotype_2'],
 'Columba_guinea'            : ['Columbicola_columbae_haplotype_2'],
 'Columba_livia'             : ['Columbicola_columbae_haplotype_1'],
 'Columba_plumbea'           : ['Columbicola_adamsi'],
 'Columba_speciosa'          : ['Columbicola_adamsi'],
 'Columba_subvinacea'        : [],
 'Columbina_inca'            : ['Columbicola_passerinae_haplotype_1'],
 'Columbina_passerina'       : ['Columbicola_passerinae_haplotype_1'],
 'Geotrygon_montana'         : [],
 'Leptotila_jamaicensis'     : [],
 'Leptotila_plumbeiceps'     : [],
 'Leptotila_rufaxilla'       : [],
 'Leptotila_verreauxi'       : [],
 'Metriopelia_ceciliae'      : [],
 'Oena_capensis'             : [],
 'Patagioenas_fasciata'      : [],
 'Phapitreron_amethystina'   : [],
 'Phapitreron_leucotis'      : [],
 'Ptilinopus_occipitalis'    : [],
 'Streptopelia_capicola'     : [],
 'Streptopelia_decaocto'     : [],
 'Streptopelia_senegalensis' : [],
 'Zenaida_asiatica'          : [],
 'Zenaida_galapagoensis'     : [],
 'Zenaida_macroura'          : [] }

	12S	COI	CYTB	IRBP	MB	ND2	NGF	NTF3	RAG1
Claravis pretiosa	0	383	1045	804	0	1041	0	0	2673
Columba guinea	0	383	972	0	0	1066	0	0	0
Columba livia	446	383	1045	804	717	1067	802	951	2865
Columba plumbea	0	383	1045	0	0	1092	0	0	0
Columba speciosa	0	383	1045	804	0	1066	0	0	2733
Columba subvinacea	0	379	1045	0	0	1066	0	0	0
Columbina inca	0	383	1039	0	0	1091	0	0	0
Columbina passerina	0	383	1045	0	730	1089	736	728	0
Geotrygon montana	0	383	1010	804	740	1066	711	711	2658
Leptotila jamaicensis	0	383	1017	0	0	1041	0	0	0
Leptotila plumbeiceps	0	383	1045	0	0	1041	0	0	0
Leptotila rufaxilla	0	379	1045	804	0	1041	0	0	2655
Leptotila verreauxi	0	383	1045	0	0	1066	0	0	0
Metriopelia ceciliae	0	383	1068	0	0	1092	0	0	0
Oena capensis	389	0	1044	804	0	1034	0	0	2694
Patagioenas fasciata	0	379	1069	0	0	1066	0	0	0
Phapitreron amethystina	0	383	1010	804	0	1031	0	0	2733
Phapitreron leucotis	0	383	1045	0	0	1029	0	0	0
Ptilinopus occipitalis	400	376	1052	0	0	1015	0	0	0
Streptopelia capicola	0	383	1045	804	0	1066	0	0	2691
Streptopelia decaocto	0	379	1068	0	0	1066	0	0	0
Streptopelia senegalensis	0	383	1045	0	0	1065	0	0	0
Zenaida asiatica	0	383	1034	0	0	1092	0	0	0
Zenaida galapagoensis	0	377	1045	0	0	1082	0	0	0
Zenaida macroura	0	382	1045	804	0	1092	0	0	2733

	12S	COI	EF1
Columbicola_adamsi	337	0	328
Columbicola_baculoides	348	386	350
Columbicola_columbae_haplotype_1	335	0	350
Columbicola_columbae_haplotype_2	0	0	350
Columbicola_deboomi	0	0	350
Columbicola_gracilicapitis	343	383	350
Columbicola_macrourae_haplotype_1	349	0	350
Columbicola_macrourae_haplotype_2	339	0	317
Columbicola_macrourae_haplotype_3	339	0	350
Columbicola_macrourae_haplotype_4	0	0	350
Columbicola_passerinae_haplotype_1	337	383	317
Columbicola_passerinae_haplotype_2	345	383	317
Columbicola_sp.	0	0	340
Columbicola_theresae	0	383	350
Columbicola_xavieri	0	0	350
Erwinia_herbicola	0	0	0