author: lukethompson@gmail.com
date: 16 Nov 2016
language: Python 2.7
license: BSD3

blast_xml_to_taxonomy.ipynb

Takes the XML output of blastn (query: Deblur OTU, database: RDP Release 11, percent ID: 100%), parses it, and creates a file with the query, top RDP lineage (with number of hits having that lineage over total hits), and top-3 RDP species (with number of hits having that species over total hits).


In [1]:
import pandas as pd
import numpy as np
import Bio.Blast.NCBIXML
from cStringIO import StringIO
from __future__ import print_function

In [5]:
# convert RDP-style lineage to Greengenes-style lineage
def rdp_lineage_to_gg(lineage):
    d = {}
    linlist = lineage.split(';')
    for i in np.arange(0, len(linlist), 2):
        d[linlist[i+1]] = linlist[i]
    linstr = ''
    for level in ['domain', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus']:
        try:
            linstr += level[0] + '__' + d[level].replace('"', '') + '; '
        except:
            linstr += level[0] + '__' + '; '
    linstr = linstr[:-2]
    return(linstr)

In [6]:
# parse blast xml record
def parse_record_alignments_taxonomy(record):
    df = pd.DataFrame(columns=('strain', 'lineage'))
    for alignment in record.alignments:
        strain, lineage = alignment.hit_def.split('   ')
        linstr = rdp_lineage_to_gg(lineage)
        df = df.append({'strain': strain, 'lineage': linstr}, ignore_index=True)
    df['species'] = [(x.split(' ')[0] + ' ' + x.split(' ')[1]).replace(';', '') for x in df.strain]
    num_hits = df.shape[0]
    vc_species = df.species.value_counts()
    vc_lineage = df.lineage.value_counts()
    return(num_hits, vc_species, vc_lineage)

In [7]:
# main function
def xml_to_taxonomy(path_xml, path_output):
    # read file as single string, generate handle, and parse xml handle to records generator
    with open(path_xml) as file:
        str_xml = file.read()
    handle_xml = StringIO(str_xml)
    records = Bio.Blast.NCBIXML.parse(handle_xml)

    # write top lineage and top 3 strains for each query
    with open(path_output, 'w') as target:
        # write header
        target.write('query\tlineage_count\tspecies_1st_count\tspecies_2nd_count\tspecies_3rd_count\n')
        # iterate over records generator
        for record in records:
            target.write('%s' % record.query)
            try:
                num_hits, vc_species, vc_lineage = parse_record_alignments_taxonomy(record)
            except:
                pass
            try:
                target.write('\t%s (%s/%s)' % (vc_lineage.index[0], vc_lineage[0], num_hits))
            except:
                pass
            try:
                target.write('\t%s (%s/%s)' % (vc_species.index[0], vc_species[0], num_hits))
            except:
                pass
            try:
                target.write('\t%s (%s/%s)' % (vc_species.index[1], vc_species[1], num_hits))
            except:
                pass
            try:
                target.write('\t%s (%s/%s)' % (vc_species.index[2], vc_species[2], num_hits))
            except:
                pass
            target.write('\n')

Run for 90-bp sequences (top 500 by prevalence in 90-bp biom table)


In [2]:
path_xml = '../../data/sequence-lookup/rdp-taxonomy/otu_seqs_top_500_prev.emp_deblur_90bp.subset_2k.rare_5000.xml'
path_output = 'otu_seqs_top_500_prev.emp_deblur_90bp.subset_2k.rare_5000.tsv'
xml_to_taxonomy(path_xml, path_output)

Run for 100-bp sequences (top 500 by prevalence in 100-bp biom table)


In [3]:
path_xml = '../../data/sequence-lookup/rdp-taxonomy/otu_seqs_top_500_prev.emp_deblur_100bp.subset_2k.rare_5000.xml'
path_output = 'otu_seqs_top_500_prev.emp_deblur_100bp.subset_2k.rare_5000.tsv'
xml_to_taxonomy(path_xml, path_output)

Run for 150-bp sequences (top 500 by prevalence in 150-bp biom table)


In [4]:
path_xml = '../../data/sequence-lookup/rdp-taxonomy/otu_seqs_top_500_prev.emp_deblur_150bp.subset_2k.rare_5000.xml'
path_output = 'otu_seqs_top_500_prev.emp_deblur_150bp.subset_2k.rare_5000.tsv'
xml_to_taxonomy(path_xml, path_output)