In [3]:
import re
from collections import defaultdict
import itertools

import pandas as pd
import numpy as np


import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt


from matplotlib import collections as mc


/home/maubar/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [4]:
with open("blast_tsv_columns.txt") as fh:
    blast_cols = next(fh).rstrip(" \n").split(" ")
blast_cols


Out[4]:
['qseqid',
 'sseqid',
 'pident',
 'qcovs',
 'score',
 'qlen',
 'qstart',
 'qend',
 'slen',
 'sstart',
 'send',
 'length',
 'mismatch',
 'gapopen']

In [5]:
blast_hits = pd.read_csv("metaspades1_to_candidates_filt.tsv",header=None,names=blast_cols,sep="\t")

In [6]:
blast_hits.head()


Out[6]:
qseqid sseqid pident qcovs score qlen qstart qend slen sstart send length mismatch gapopen
0 NODE_25_length_30926_cov_18.7711 NODE_109_length_25355_cov_5.19581_ID_86627 99.996 77 47026 30926 1 23516 25355 23516 1 23516 1 0
1 NODE_203_length_12813_cov_28.6907 NODE_683_length_8295_cov_9.51971_ID_110378 100.000 50 12886 12813 1 6443 8295 1853 8295 6443 0 0
2 NODE_353_length_9780_cov_17.0422 NODE_141_length_22510_cov_15.0782_ID_182954 100.000 100 19560 9780 1 9780 22510 19220 9441 9780 0 0
3 NODE_687_length_6958_cov_21.6469 NODE_156_length_21389_cov_10.4886_ID_87504 100.000 100 13916 6958 1 6958 21389 11354 4397 6958 0 0
4 NODE_1451_length_4109_cov_49.7083 NODE_156_length_21389_cov_10.4886_ID_87504 100.000 100 8218 4109 1 4109 21389 14257 18365 4109 0 0

In [17]:
def plot_hits(df, sseqid,out_file):
    #Calculate reference line (subject sequence)
    main_line = [(0,0),(df.slen.iloc[0],0)]
    
    #Calculate lines of query hits, shifted with respect to their alignment to subject sequence
    full_queries = []
    aligned_queries = []
    labels = [(0,-0.05,sseqid)]
    for hit_number,(_,row) in enumerate(df.iterrows()):
        y_pos = (hit_number + 1) * 0.1
        #Positive strand
        if row["sstart"] < row["send"]:
            shifted_start = row["sstart"] - row["qstart"]
        #Negative strand
        else:
            shifted_start = row["sstart"] - row["qend"]

        #Full length of the query
        new_full_query = [(shifted_start,y_pos),(shifted_start+row["qlen"],y_pos)]
        #Segment of the query that aligns to the reference
        new_aligned_query = [(row["sstart"],y_pos),(row["send"],y_pos)]
        new_label = (shifted_start,y_pos,row["qseqid"])

        full_queries.append(new_full_query)
        aligned_queries.append(new_aligned_query)
        labels.append(new_label)
    
    
    pad_left_down = [(-df.slen.iloc[0]/4,-len(new_full_query)*0.2), (0,-len(new_full_query)*0.2)]
    pad_right_up  = [(df.slen.iloc[0],(1+len(new_full_query))*0.1), (5*df.slen.iloc[0]/4,(1+len(new_full_query))*0.1)]
    
    #Matplotlib plot
    mpl.rcParams['figure.figsize'] = (10.0, 8.0)
    fig,ax = plt.subplots()
    lc1 = mc.LineCollection([main_line],linewidths=10,colors=["blue"])
    lc2 = mc.LineCollection(full_queries,linewidths=3,colors=["black"])
    lc3 = mc.LineCollection(aligned_queries,linewidths=3,colors=["green"])
    pad = mc.LineCollection([pad_left_down,pad_right_up],linewidths=3,colors=["white"])
    
    ax.add_collection(lc1)
    ax.add_collection(lc2)
    ax.add_collection(lc3)
    ax.add_collection(pad)

    #Add labels
    for lbl in labels:
        ax.text(lbl[0],lbl[1]+0.01,lbl[2])

    #Set title
    ax.autoscale()
    ax.set_title(sseqid)
    
    fig.savefig(out_file)
    plt.close()

In [18]:
for sseqid,df in blast_hits.groupby("sseqid"):
    plot_hits(df,sseqid,"{}.png".format(sseqid))