Reverse Ecology and Metatranscriptomics of Uncultivated Freshwater Actinobacteria

Figure 1: Phylogenetic Tree

Overview

Construction of the phylogenetic tree requires the following steps:

  1. Download reference genomes.
  2. Extract concatenated marker gene sequences using Phylosift.
  3. Construct phylogenetic tree using RAxML.
  4. Visualize phylogenetic tree using the ETE Toolkit for Python.

Step 1: Downloading Reference Genomes

The reference genomes used in this study are available through IMG, Genbank, and a personal website, as described in Table 1. The outgroup genomes used in the phylogenetic tree are available through IMG, as described in Table S17 in the Supplemental Online Material. The genome sequences need to be downloaded in FASTA nucleotide format and placed in data/refGenomes/fna. One file per genome, of the form genome.fna.

Step 2: Extract concatenated marker gene sequences using Phylosift


In [ ]:
%reset
# Import packages
from Bio import SeqIO
import os
import subprocess

# Define folders and paths
genomeDir = '../data/refGenomes/fna'
outgroupDir = '../data/refGenomes/outgroups'
scratchDir = '../data/scratch'
psInputDir = scratchDir+'/psInput'
resultsDir = '../figures'
treeDir = resultsDir+'/fig1-tree'
phylosiftPath = '/Applications/phylosift/phylosift'

# Check that psInputDir exists: genomes for use with Phylosift will get
# transferred there
if not os.path.exists(psInputDir):
    os.makedirs(psInputDir)
    
# Check that resultsDir and treeDir exist: alignment for use with Raxml will get
# transferred there
if not os.path.exists(resultsDir):
    os.makedirs(resultsDir)
if not os.path.exists(treeDir):
    os.makedirs(treeDir)

# Retrieve the list of genomes to process
genomeList = []
for item in os.listdir(genomeDir):
    if item.endswith('.fna'):
        genomeList.append(item)
        
genomeList = [genome.replace('.fna', '') for genome in genomeList]
        
outgroupList = []
for item in os.listdir(outgroupDir):
    if item.endswith('.fna'):
        outgroupList.append(item)

outgroupList = [genome.replace('.fna', '') for genome in outgroupList]

# For each genome, concatenate individual contigs. Repeat for refGenomes and outgroups.
for genome in genomeList:
    inFile = open(genomeDir+'/'+genome+'.fna')
    outFile = open(psInputDir+'/'+genome+'.fna', "w")
    
    # Create a header for the fasta file
    print >>outFile, '>'+genome
    
    # Concatenate all the sequences
    seqList = []
    for seqRecord in SeqIO.parse(inFile, 'fasta'):
        seqList.append(seqRecord.seq)
        
    concatSeq = "".join([str(seqRec) for seqRec in seqList])
    print >>outFile, concatSeq
    
    inFile.close()
    outFile.close()

for genome in outgroupList:
    inFile = open(outgroupDir+'/'+genome+'.fna')
    outFile = open(psInputDir+'/'+genome+'.fna', "w")
    
    # Create a header for the fasta file
    print >>outFile, '>'+genome
    
    # Concatenate all the sequences
    seqList = []
    for seqRecord in SeqIO.parse(inFile, 'fasta'):
        seqList.append(seqRecord.seq)
        
    concatSeq = "".join([str(seqRec) for seqRec in seqList])
    print >>outFile, concatSeq
    
    inFile.close()
    outFile.close()
    
print "Processing complete."
print "Running phylosift."
    
# Run Phylosift search and alignment steps
for genome in genomeList:
   print "Running search step on genome %s (%s of %s)" % (genome, genomeList.index(genome)+1, len(genomeList))
   subprocess.call([phylosiftPath, 'search', '--besthit', psInputDir+'/'+genome+'.fna'])
   
   print "Running align step on genome %s (%s of %s)" % (genome, genomeList.index(genome)+1, len(genomeList))
   subprocess.call([phylosiftPath, 'align', '--besthit', psInputDir+'/'+genome+'.fna'])
    
for genome in outgroupList:
   print "Running search step on outgroup %s (%s of %s)" % (genome, outgroupList.index(genome)+1, len(outgroupList))
   subprocess.call([phylosiftPath, 'search', '--besthit', psInputDir+'/'+genome+'.fna'])
   
   print "Running align step on outgroup %s (%s of %s)" % (genome, outgroupList.index(genome)+1, len(outgroupList))
   subprocess.call([phylosiftPath, 'align', '--besthit', psInputDir+'/'+genome+'.fna'])
   
print "Phylosift complete."

# Create concatenated alignment file.

outFile = open(treeDir+'/'+'tree.aln', "w")

for genome in genomeList+outgroupList:
    inFile = open('PS_temp'+'/'+genome+'.fna'+'/alignDir/concat.updated.1.fasta')
    
    print >>outFile, '>'+genome
    for seqRecord in SeqIO.parse(inFile, 'fasta'):
        print >>outFile, seqRecord.seq
        
    inFile.close()
outFile.close()

print "Concatenation complete."

Step 3: Construct phylogenetic tree using RAxML

RAxML will take a long time to run if run on a personal computer. Instead, you can run FastTree to get approximately the same results.


In [ ]:
%reset
# Import packages
import os
import shutil
import subprocess

# Define folders and paths
resultsDir = '../figures'
treeDir = resultsDir+'/fig1-tree'
fastTreePath = '/Applications/phylosift/bin/FastTree'
raxmlPath = '/Applications/RAxML/raxmlHPC-PTHREADS-AVX'

# Run FastTree if you're running this on a personal computer
# The command is: /Applications/phylosift/bin/FastTree -out scratch/raxml/tree.nwk scratch/raxml/tree.aln 
subprocess.call([fastTreePath, '-out', treeDir+'/tree.nwk', 
                 treeDir+'/tree.aln'])

# Actual RAxML command run on a server
# The command is: /Applications/RAxML/raxmlHPC-PTHREADS-AVX -f a -m PROTGAMMAAUTO -p 12345 -x 12345 -# 100 -s results/fig1-tree/tree.aln -T 24 -n tree
#subprocess.call([raxmlPath, '-f', 'a', '-m',  
#                 'PROTGAMMAAUTO', '-p', '12345', '-x', '12345', '-#', '100', 
#                 '-s', treeDir+'/tree.aln', '-T', '24', '-n', 'tree'])

# Move the results of the RAxML run to treeDir
for myFile in os.listdir('.'):
    if '.tree' in myFile:
        shutil.move(myFile, treeDir)

# Rename RAxML_bipartitions.tree into tree.aln
shutil.move(treeDir+'/RAxML_bipartitions.tree', treeDir+'/tree.nwk')

Step 4: Visualize phylogenetic tree using ETE Toolkit


In [ ]:
%reset
# Import packages
import ete3
import re
import os

# Define folders
externalDataDir = '../data/externalData'
resultsDir = '../figures'
treeDir = resultsDir+'/fig1-tree'

# Check that treeDir exists
if not os.path.exists(treeDir):
    os.makedirs(treeDir)
    
# Read in the phylogenetic tree
actinoTree = ete3.Tree(treeDir+'/tree.nwk')

# Set the outgroup
actinoTree.set_outgroup('Pnec')

# Define the node style
def nodeStyleFunct(node):
    nodeStyle = ete3.NodeStyle()
    nodeStyle['size'] = 5
    nodeStyle['shape'] = 'circle'
    if node.support >= 90:
        nodeStyle['fgcolor'] = 'black'
    elif node.support >= 70:
        nodeStyle['fgcolor'] = 'grey'
    else:
        nodeStyle['size'] = 0
    # Apply to all nodes
    node.set_style(nodeStyle)

# Label internal nodes with clade names
# Create a dict showing the leaves assoc with each clade
cladeNodeDict = {}
with open(externalDataDir+'/cladeLeaves.csv', 'rU') as inFile:
    for line in inFile:
        line = re.sub('["\n]', '', line)
        key = line.split(',')[0]
        value = line.split(',')[1:]
        cladeNodeDict[key] = value    
# Add labels to internal nodes, based on their descendants
for key in cladeNodeDict.keys():
    for node in actinoTree.traverse():
        if set(node.get_leaf_names()) == set(cladeNodeDict[key]):
            node.add_face(ete3.TextFace(key, fgcolor='blue'), 0, 'branch-top')

# Update internal node names with proper genome/species name
# Create a dict mapping node to species names
leafNameDict = {}
with open(externalDataDir+'/leafNames.csv', 'rU') as inFile:
    for line in inFile:
        [key, value] = line.split(',')
        leafNameDict[key] = value.strip()
# Place genome names at the right edge of the tree and align        
for node in actinoTree.iter_leaves():
# If node describes a species, make it italic
    if len(leafNameDict[node.name].split(' ')) > 1:
        node.add_face(ete3.TextFace(leafNameDict[node.name], fstyle='italic'), 0, 'branch-right')
    else:
        node.add_face(ete3.TextFace(leafNameDict[node.name]), 0, 'branch-right')

# Create a column on the right-hand side showing taxonomy of outgroups
rhsDict = {}
with open(externalDataDir+'/taxonNames.csv', 'rU') as inFile:
    for line in inFile:
        [key, value] = line.split(',')
        rhsDict[key] = value.strip()
# Place genome names at the right edge of the tree and align        
for node in actinoTree.iter_leaves():
    if node.name in rhsDict.keys():
        node.add_face(ete3.TextFace(rhsDict[node.name]), 0, 'aligned')
    
# Define the tree style
treeStyle = ete3.TreeStyle()
treeStyle.layout_fn = nodeStyleFunct
#treeStyle.show_branch_support = True
treeStyle.branch_vertical_margin = 10 # vertical spacing
#treeStyle.draw_guiding_lines = True
treeStyle.show_leaf_name = False

# Render the phylogenetic tree
#actinoTree.render("%%inline", tree_style = treeStyle)
actinoTree.render(treeDir+'/tree-full.svg', tree_style = treeStyle)

Simplify Tree for Manuscript

This bit of code simplifies the tree to retain only a single Actinomycetales genome as the outgroup. The figure is then touched up in Adobe Illustrator to give the final figure presented in the mansucript.


In [ ]:
%reset
# Import packages
import ete3
import re
import os

# Define folders
externalDataDir = '../data/externalData'
resultsDir = '../figures'
treeDir = resultsDir+'/fig1-tree'

# Check that treeDir exists
if not os.path.exists(treeDir):
    os.makedirs(treeDir)
    
# Read in the phylogenetic tree
actinoTree = ete3.Tree(treeDir+'/tree.nwk')

# Define the node style
def nodeStyleFunct(node):
    nodeStyle = ete3.NodeStyle()
    nodeStyle['size'] = 5
    nodeStyle['shape'] = 'circle'
    if node.support >= 90:
        nodeStyle['fgcolor'] = 'black'
    elif node.support >= 70:
        nodeStyle['fgcolor'] = 'grey'
    else:
        nodeStyle['size'] = 0
    # Apply to all nodes
    node.set_style(nodeStyle)

# # Label internal nodes with clade names
# # Create a dict showing the leaves assoc with each clade
# cladeNodeDict = {}
# with open(externalDataDir+'/cladeLeaves.csv', 'rU') as inFile:
#     for line in inFile:
#         line = re.sub('["\n]', '', line)
#         key = line.split(',')[0]
#         value = line.split(',')[1:]
#         cladeNodeDict[key] = value    
# # Add labels to internal nodes, based on their descendants
# for key in cladeNodeDict.keys():
#     for node in actinoTree.traverse():
#         if set(node.get_leaf_names()) == set(cladeNodeDict[key]):
#             node.add_face(ete3.TextFace(key, fgcolor='blue'), 0, 'branch-top')

# Update internal node names with proper genome/species name
# Create a dict mapping node to species names
leafNameDict = {}
with open(externalDataDir+'/leafNames.csv', 'rU') as inFile:
    for line in inFile:
        [key, value] = line.split(',')
        leafNameDict[key] = value.strip()
# Place genome names at the right edge of the tree and align        
for node in actinoTree.iter_leaves():
# If node describes a species, make it italic
    if len(leafNameDict[node.name].split(' ')) > 1:
        node.add_face(ete3.TextFace(leafNameDict[node.name], fstyle='italic'), 0, 'branch-right')
    else:
        node.add_face(ete3.TextFace(leafNameDict[node.name]), 0, 'branch-right')

# Create a column on the right-hand side showing taxonomy of outgroups
rhsDict = {}
with open(externalDataDir+'/taxonNames.csv', 'rU') as inFile:
    for line in inFile:
        [key, value] = line.split(',')
        rhsDict[key] = value.strip()
# Place genome names at the right edge of the tree and align        
for node in actinoTree.iter_leaves():
    if node.name in rhsDict.keys():
        node.add_face(ete3.TextFace(rhsDict[node.name]), 0, 'aligned')
    
# Define the tree style
treeStyle = ete3.TreeStyle()
treeStyle.layout_fn = nodeStyleFunct
#treeStyle.show_branch_support = True
treeStyle.branch_vertical_margin = 10 # vertical spacing
#treeStyle.draw_guiding_lines = True
treeStyle.show_leaf_name = False

# Drop individual nodes
nodeDeleteList = ['Pnec', '637000248', '646311917', '2503538010', '646564571', 
                  '648028042', '646311931', '2506783060', '639633044', '646564587', 
                  '646311932', '651053061', '644736393', '2511231200', '644736339', 
                  '646311938', '646361958', '646311958', 
                  '648028043', '2585427650', '2639763019', '2576861448', '646564505',
                  '2505679089', '646564520', '646311968', '644736376', '646564565',
                  '644736331', '640753031', '643348509', '644736322', '639633001', 
                  '650377905', '643692008', '643348516', '646564566', '2639762896']

for node in actinoTree.iter_leaves():
    if node.name in nodeDeleteList:
        node.delete(preserve_branch_length=True)

# Set the outgroup
actinoTree.set_outgroup('2684623058')

# Render the phylogenetic tree
#actinoTree.render("%%inline", tree_style = treeStyle)
actinoTree.render(treeDir+'/tree-ms.svg', tree_style = treeStyle, w=9, units='in')

Simplify Tree for Supplement

This bit of code simplifies the tree to include only a single genome for each freshwater tribe and for the outgroup classes. The figure is then touched up in Adobe Illustrator to give the final figure presented in the mansucript.


In [ ]:
%reset
# Import packages
import ete3
import re
import os

# Define folders
externalDataDir = '../data/externalData'
resultsDir = '../figures'
treeDir = resultsDir+'/fig1-tree'

# Check that treeDir exists
if not os.path.exists(treeDir):
    os.makedirs(treeDir)
    
# Read in the phylogenetic tree
actinoTree = ete3.Tree(treeDir+'/tree.nwk')

# Define the node style
def nodeStyleFunct(node):
    nodeStyle = ete3.NodeStyle()
    nodeStyle['size'] = 5
    nodeStyle['shape'] = 'circle'
    if node.support >= 90:
        nodeStyle['fgcolor'] = 'black'
    elif node.support >= 70:
        nodeStyle['fgcolor'] = 'grey'
    else:
        nodeStyle['size'] = 0
    # Apply to all nodes
    node.set_style(nodeStyle)

# # Label internal nodes with clade names
# # Create a dict showing the leaves assoc with each clade
# cladeNodeDict = {}
# with open(externalDataDir+'/cladeLeaves.csv', 'rU') as inFile:
#     for line in inFile:
#         line = re.sub('["\n]', '', line)
#         key = line.split(',')[0]
#         value = line.split(',')[1:]
#         cladeNodeDict[key] = value    
# # Add labels to internal nodes, based on their descendants
# for key in cladeNodeDict.keys():
#     for node in actinoTree.traverse():
#         if set(node.get_leaf_names()) == set(cladeNodeDict[key]):
#             node.add_face(ete3.TextFace(key, fgcolor='blue'), 0, 'branch-top')

# Update internal node names with proper genome/species name
# Create a dict mapping node to species names
leafNameDict = {}
with open(externalDataDir+'/leafNames.csv', 'rU') as inFile:
    for line in inFile:
        [key, value] = line.split(',')
        leafNameDict[key] = value.strip()
# Place genome names at the right edge of the tree and align        
for node in actinoTree.iter_leaves():
# If node describes a species, make it italic
    if len(leafNameDict[node.name].split(' ')) > 1:
        node.add_face(ete3.TextFace(leafNameDict[node.name], fstyle='italic'), 0, 'branch-right')
    else:
        node.add_face(ete3.TextFace(leafNameDict[node.name]), 0, 'branch-right')

# Create a column on the right-hand side showing taxonomy of outgroups
rhsDict = {}
with open(externalDataDir+'/taxonNames.csv', 'rU') as inFile:
    for line in inFile:
        [key, value] = line.split(',')
        rhsDict[key] = value.strip()
# Place genome names at the right edge of the tree and align        
for node in actinoTree.iter_leaves():
    if node.name in rhsDict.keys():
        node.add_face(ete3.TextFace(rhsDict[node.name]), 0, 'aligned')
    
# Define the tree style
treeStyle = ete3.TreeStyle()
treeStyle.layout_fn = nodeStyleFunct
#treeStyle.show_branch_support = True
treeStyle.branch_vertical_margin = 10 # vertical spacing
#treeStyle.draw_guiding_lines = True
treeStyle.show_leaf_name = False

# Drop individual nodes
nodeDeleteList = ['Pnec', '637000248', '646311917', '2503538010', '646564571', 
                  '648028042', '646311931', '2506783060', '639633044', '646564587', 
                  '646311932', '651053061', '644736393', '2511231200', '644736339', 
                  '646311938', '646361958', '646311958', 
                  '648028043', '2585427650', '2639763019', '2576861448', '646564505',
                  '2505679089', '646564520', '646311968', '644736376', '646564565',
                  '644736331', '640753031', '643348509',
                 'MEint885', 'BIN_10',
                 'AAA278O22', 'AAA023J06', 'AAA024D14', 'AAA041L13', 'AAA028E20', 'AAA044O16',
                 'AAA044D11', 'acIB-AMD-6', 'AAA028A23', 'AAA027L06', 'AB141P03', 'AAA027J17',
                  'AAA023D18', 'AAA278I18', 'MEint283']

for node in actinoTree.iter_leaves():
    if node.name in nodeDeleteList:
        node.delete(preserve_branch_length=True)

# Set the outgroup
actinoTree.set_outgroup('644736322')

# Render the phylogenetic tree
#actinoTree.render("%%inline", tree_style = treeStyle)
actinoTree.render(treeDir+'/tree-suppl.svg', tree_style = treeStyle, w=9, units='in')

In [ ]: