This is some really ugly code illustrating how you could generate a biom table from a blast9 file. We're doing this for an experiment with RapSearch2 for read mapping, and are using this for testing purposes only. We'll need to get better support for this before using for real.


In [1]:
# This cell is not necessary when running the steps below on an existing file

s = """# Fields: Query Subject identity        aln-len mismatch        gap-openings    q.start q.end   s.start s.end   log(e-value)    bit-score
4472165.3.31762_6165	11846454	50	24	12	0	12	83	967	990	0.405978	27.72
4472165.3.31762_6165	11846454	90.9091	11	1	0	54	86	197	207	0.405978	28.11
4472165.3.31762_6165	11815348	57.6923	26	11	0	12	89	55	80	0.86	35.42
4472165.3.31762_6165	12307352	57.6923	26	11	0	12	89	1005	1030	0.86	35.42
4472165.3.31762_6165	12304087	57.6923	26	11	0	12	89	999	1024	0.86	35.42
4472165.3.31762_6165	12022820	57.6923	26	11	0	12	89	1721	1746	0.97	35.04
4472165.3.31762_6165	3545817	57.6923	26	11	0	12	89	418	443	0.97	35.04
4472165.3.31762_6165	11801263	57.6923	26	11	0	12	89	54	79	0.97	35.04
4472165.3.31762_10621	6133288	45.8333	24	13	0	75	4	35	58	0.86	35.42
4472165.3.31762_10621	12247112	45.8333	24	13	0	75	4	1318	1341	0.86	35.42
4472165.3.31762_10621	4381619	45.8333	24	13	0	75	4	423	446	0.98	35.04
4472165.4.31762_6165	11846454	50	24	12	0	12	83	967	990	0.405978	27.72
4472165.4.31762_6165	11846454	90.9091	11	1	0	54	86	197	207	0.405978	28.11
4472165.4.31762_6165	11815348	57.6923	26	11	0	12	89	55	80	0.86	35.42
4472165.4.31762_6165	12307352	57.6923	26	11	0	12	89	1005	1030	0.86	35.42
4472165.4.31762_6165	12304087	57.6923	26	11	0	12	89	999	1024	0.86	35.42
4472165.4.31762_6165	12022820	57.6923	26	11	0	12	89	1721	1746	0.97	35.04
4472165.4.31762_6165	3545817	57.6923	26	11	0	12	89	418	443	0.97	35.04
4472165.4.31762_6165	11801263	57.6923	26	11	0	12	89	54	79	0.97	35.04
4472165.4.31762_10621	6133288	45.8333	24	13	0	75	4	35	58	0.86	35.42
4472165.4.31762_10621	12247112	45.8333	24	13	0	75	4	1318	1341	0.86	35.42
4472165.4.31762_10621	4381619	45.8333	24	13	0	75	4	423	446	0.98	35.04"""

import tempfile

tf = tempfile.NamedTemporaryFile(delete=False)
tf.write(s)
tf.close()

In [2]:
# Update the minimum percent id and the MaxEvalue here - note that RapSearch2 is outputting 
# log(e-value). The MaxEvalue used here should be the log(e-value).

from brokit.blat import MinimalBlatParser9

max_log_evalue = 1.0
minimum_pct_id = 0.55

def bl9_to_observation_map(raw_output_fp, output_observation_map_fp, max_log_evalue, minimum_pct_id):
    """ Generate observation map from .bl9 file
    """
    result = {}
    pct_id_field = 2
    evalue_field = 10
    output_observation_map_f = open(output_observation_map_fp, 'w')
    last_observed_query_id = None
    for summary, blat_results in MinimalBlatParser9(open(raw_output_fp, 'U'), include_column_names=False):
        for e in blat_results:
            if (float(e[evalue_field]) <= max_log_evalue and
                float(e[pct_id_field]) / 100. >= minimum_pct_id):
                query_id = e[0]
                subject_id = e[1]
                if query_id == last_observed_query_id:
                    # we've observed a duplicate hit, ignore this one
                    continue
                else:
                    last_observed_query_id = query_id
                    try:
                        result[subject_id].append(query_id)
                    except KeyError:
                        result[subject_id] = [query_id]
    for e in result.items():
        output_observation_map_f.write(
            '%s\t%s\n' %
            (e[0], '\t'.join(e[1])))
    output_observation_map_f.close()
    return result

In [3]:
result = bl9_to_observation_map(tf.name, 'observation-map.txt', max_log_evalue, minimum_pct_id)

In [4]:
!cat observation-map.txt


11846454	4472165.3.31762_6165	4472165.4.31762_6165

In [5]:
!make_otu_table.py -i observation-map.txt -o table.biom

In [6]:
!biom summarize-table -i table.biom


Num samples: 2
Num observations: 1
Total count: 2
Table density (fraction of non-zero values): 1.000

Counts/sample summary:
 Min: 1.0
 Max: 1.0
 Median: 1.000
 Mean: 1.000
 Std. dev.: 0.000
 Sample Metadata Categories: 
 Observation Metadata Categories: 

Counts/sample detail:
 4472165.3.31762: 1.0
 4472165.4.31762: 1.0

In [6]: