author: lukethompson@gmail.com
date: 16 Nov 2016
language: Python 2.7
license: BSD3
Takes the XML output of blastn (query: Deblur OTU, database: RDP Release 11, percent ID: 100%), parses it, and creates a file with the query, top RDP lineage (with number of hits having that lineage over total hits), and top-3 RDP species (with number of hits having that species over total hits).
In [1]:
import pandas as pd
import numpy as np
import Bio.Blast.NCBIXML
from cStringIO import StringIO
from __future__ import print_function
In [5]:
# convert RDP-style lineage to Greengenes-style lineage
def rdp_lineage_to_gg(lineage):
d = {}
linlist = lineage.split(';')
for i in np.arange(0, len(linlist), 2):
d[linlist[i+1]] = linlist[i]
linstr = ''
for level in ['domain', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus']:
try:
linstr += level[0] + '__' + d[level].replace('"', '') + '; '
except:
linstr += level[0] + '__' + '; '
linstr = linstr[:-2]
return(linstr)
In [6]:
# parse blast xml record
def parse_record_alignments_taxonomy(record):
df = pd.DataFrame(columns=('strain', 'lineage'))
for alignment in record.alignments:
strain, lineage = alignment.hit_def.split(' ')
linstr = rdp_lineage_to_gg(lineage)
df = df.append({'strain': strain, 'lineage': linstr}, ignore_index=True)
df['species'] = [(x.split(' ')[0] + ' ' + x.split(' ')[1]).replace(';', '') for x in df.strain]
num_hits = df.shape[0]
vc_species = df.species.value_counts()
vc_lineage = df.lineage.value_counts()
return(num_hits, vc_species, vc_lineage)
In [7]:
# main function
def xml_to_taxonomy(path_xml, path_output):
# read file as single string, generate handle, and parse xml handle to records generator
with open(path_xml) as file:
str_xml = file.read()
handle_xml = StringIO(str_xml)
records = Bio.Blast.NCBIXML.parse(handle_xml)
# write top lineage and top 3 strains for each query
with open(path_output, 'w') as target:
# write header
target.write('query\tlineage_count\tspecies_1st_count\tspecies_2nd_count\tspecies_3rd_count\n')
# iterate over records generator
for record in records:
target.write('%s' % record.query)
try:
num_hits, vc_species, vc_lineage = parse_record_alignments_taxonomy(record)
except:
pass
try:
target.write('\t%s (%s/%s)' % (vc_lineage.index[0], vc_lineage[0], num_hits))
except:
pass
try:
target.write('\t%s (%s/%s)' % (vc_species.index[0], vc_species[0], num_hits))
except:
pass
try:
target.write('\t%s (%s/%s)' % (vc_species.index[1], vc_species[1], num_hits))
except:
pass
try:
target.write('\t%s (%s/%s)' % (vc_species.index[2], vc_species[2], num_hits))
except:
pass
target.write('\n')
In [2]:
path_xml = '../../data/sequence-lookup/rdp-taxonomy/otu_seqs_top_500_prev.emp_deblur_90bp.subset_2k.rare_5000.xml'
path_output = 'otu_seqs_top_500_prev.emp_deblur_90bp.subset_2k.rare_5000.tsv'
xml_to_taxonomy(path_xml, path_output)
In [3]:
path_xml = '../../data/sequence-lookup/rdp-taxonomy/otu_seqs_top_500_prev.emp_deblur_100bp.subset_2k.rare_5000.xml'
path_output = 'otu_seqs_top_500_prev.emp_deblur_100bp.subset_2k.rare_5000.tsv'
xml_to_taxonomy(path_xml, path_output)
In [4]:
path_xml = '../../data/sequence-lookup/rdp-taxonomy/otu_seqs_top_500_prev.emp_deblur_150bp.subset_2k.rare_5000.xml'
path_output = 'otu_seqs_top_500_prev.emp_deblur_150bp.subset_2k.rare_5000.tsv'
xml_to_taxonomy(path_xml, path_output)