This is some really ugly code illustrating how you could generate a biom table from a blast9 file. We're doing this for an experiment with RapSearch2 for read mapping, and are using this for testing purposes only. We'll need to get better support for this before using for real.
In [1]:
# This cell is not necessary when running the steps below on an existing file
s = """# Fields: Query Subject identity aln-len mismatch gap-openings q.start q.end s.start s.end log(e-value) bit-score
4472165.3.31762_6165 11846454 50 24 12 0 12 83 967 990 0.405978 27.72
4472165.3.31762_6165 11846454 90.9091 11 1 0 54 86 197 207 0.405978 28.11
4472165.3.31762_6165 11815348 57.6923 26 11 0 12 89 55 80 0.86 35.42
4472165.3.31762_6165 12307352 57.6923 26 11 0 12 89 1005 1030 0.86 35.42
4472165.3.31762_6165 12304087 57.6923 26 11 0 12 89 999 1024 0.86 35.42
4472165.3.31762_6165 12022820 57.6923 26 11 0 12 89 1721 1746 0.97 35.04
4472165.3.31762_6165 3545817 57.6923 26 11 0 12 89 418 443 0.97 35.04
4472165.3.31762_6165 11801263 57.6923 26 11 0 12 89 54 79 0.97 35.04
4472165.3.31762_10621 6133288 45.8333 24 13 0 75 4 35 58 0.86 35.42
4472165.3.31762_10621 12247112 45.8333 24 13 0 75 4 1318 1341 0.86 35.42
4472165.3.31762_10621 4381619 45.8333 24 13 0 75 4 423 446 0.98 35.04
4472165.4.31762_6165 11846454 50 24 12 0 12 83 967 990 0.405978 27.72
4472165.4.31762_6165 11846454 90.9091 11 1 0 54 86 197 207 0.405978 28.11
4472165.4.31762_6165 11815348 57.6923 26 11 0 12 89 55 80 0.86 35.42
4472165.4.31762_6165 12307352 57.6923 26 11 0 12 89 1005 1030 0.86 35.42
4472165.4.31762_6165 12304087 57.6923 26 11 0 12 89 999 1024 0.86 35.42
4472165.4.31762_6165 12022820 57.6923 26 11 0 12 89 1721 1746 0.97 35.04
4472165.4.31762_6165 3545817 57.6923 26 11 0 12 89 418 443 0.97 35.04
4472165.4.31762_6165 11801263 57.6923 26 11 0 12 89 54 79 0.97 35.04
4472165.4.31762_10621 6133288 45.8333 24 13 0 75 4 35 58 0.86 35.42
4472165.4.31762_10621 12247112 45.8333 24 13 0 75 4 1318 1341 0.86 35.42
4472165.4.31762_10621 4381619 45.8333 24 13 0 75 4 423 446 0.98 35.04"""
import tempfile
tf = tempfile.NamedTemporaryFile(delete=False)
tf.write(s)
tf.close()
In [2]:
# Update the minimum percent id and the MaxEvalue here - note that RapSearch2 is outputting
# log(e-value). The MaxEvalue used here should be the log(e-value).
from brokit.blat import MinimalBlatParser9
max_log_evalue = 1.0
minimum_pct_id = 0.55
def bl9_to_observation_map(raw_output_fp, output_observation_map_fp, max_log_evalue, minimum_pct_id):
""" Generate observation map from .bl9 file
"""
result = {}
pct_id_field = 2
evalue_field = 10
output_observation_map_f = open(output_observation_map_fp, 'w')
last_observed_query_id = None
for summary, blat_results in MinimalBlatParser9(open(raw_output_fp, 'U'), include_column_names=False):
for e in blat_results:
if (float(e[evalue_field]) <= max_log_evalue and
float(e[pct_id_field]) / 100. >= minimum_pct_id):
query_id = e[0]
subject_id = e[1]
if query_id == last_observed_query_id:
# we've observed a duplicate hit, ignore this one
continue
else:
last_observed_query_id = query_id
try:
result[subject_id].append(query_id)
except KeyError:
result[subject_id] = [query_id]
for e in result.items():
output_observation_map_f.write(
'%s\t%s\n' %
(e[0], '\t'.join(e[1])))
output_observation_map_f.close()
return result
In [3]:
result = bl9_to_observation_map(tf.name, 'observation-map.txt', max_log_evalue, minimum_pct_id)
In [4]:
!cat observation-map.txt
In [5]:
!make_otu_table.py -i observation-map.txt -o table.biom
In [6]:
!biom summarize-table -i table.biom
In [6]: