notebook.community

Edit and run



In [3]:

    
import re
from collections import defaultdict
import itertools

import pandas as pd
import numpy as np


import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt


from matplotlib import collections as mc









    



/home/maubar/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')



In [4]:

    
with open("blast_tsv_columns.txt") as fh:
    blast_cols = next(fh).rstrip(" \n").split(" ")
blast_cols









    Out[4]:





['qseqid',
 'sseqid',
 'pident',
 'qcovs',
 'score',
 'qlen',
 'qstart',
 'qend',
 'slen',
 'sstart',
 'send',
 'length',
 'mismatch',
 'gapopen']



In [5]:

    
blast_hits = pd.read_csv("metaspades1_to_candidates_filt.tsv",header=None,names=blast_cols,sep="\t")



In [6]:

    
blast_hits.head()









    Out[6]:






  
    
      
      qseqid
      sseqid
      pident
      qcovs
      score
      qlen
      qstart
      qend
      slen
      sstart
      send
      length
      mismatch
      gapopen
    
  
  
    
      0
      NODE_25_length_30926_cov_18.7711
      NODE_109_length_25355_cov_5.19581_ID_86627
      99.996
      77
      47026
      30926
      1
      23516
      25355
      23516
      1
      23516
      1
      0
    
    
      1
      NODE_203_length_12813_cov_28.6907
      NODE_683_length_8295_cov_9.51971_ID_110378
      100.000
      50
      12886
      12813
      1
      6443
      8295
      1853
      8295
      6443
      0
      0
    
    
      2
      NODE_353_length_9780_cov_17.0422
      NODE_141_length_22510_cov_15.0782_ID_182954
      100.000
      100
      19560
      9780
      1
      9780
      22510
      19220
      9441
      9780
      0
      0
    
    
      3
      NODE_687_length_6958_cov_21.6469
      NODE_156_length_21389_cov_10.4886_ID_87504
      100.000
      100
      13916
      6958
      1
      6958
      21389
      11354
      4397
      6958
      0
      0
    
    
      4
      NODE_1451_length_4109_cov_49.7083
      NODE_156_length_21389_cov_10.4886_ID_87504
      100.000
      100
      8218
      4109
      1
      4109
      21389
      14257
      18365
      4109
      0
      0



In [17]:

    
def plot_hits(df, sseqid,out_file):
    #Calculate reference line (subject sequence)
    main_line = [(0,0),(df.slen.iloc[0],0)]
    
    #Calculate lines of query hits, shifted with respect to their alignment to subject sequence
    full_queries = []
    aligned_queries = []
    labels = [(0,-0.05,sseqid)]
    for hit_number,(_,row) in enumerate(df.iterrows()):
        y_pos = (hit_number + 1) * 0.1
        #Positive strand
        if row["sstart"] < row["send"]:
            shifted_start = row["sstart"] - row["qstart"]
        #Negative strand
        else:
            shifted_start = row["sstart"] - row["qend"]

        #Full length of the query
        new_full_query = [(shifted_start,y_pos),(shifted_start+row["qlen"],y_pos)]
        #Segment of the query that aligns to the reference
        new_aligned_query = [(row["sstart"],y_pos),(row["send"],y_pos)]
        new_label = (shifted_start,y_pos,row["qseqid"])

        full_queries.append(new_full_query)
        aligned_queries.append(new_aligned_query)
        labels.append(new_label)
    
    
    pad_left_down = [(-df.slen.iloc[0]/4,-len(new_full_query)*0.2), (0,-len(new_full_query)*0.2)]
    pad_right_up  = [(df.slen.iloc[0],(1+len(new_full_query))*0.1), (5*df.slen.iloc[0]/4,(1+len(new_full_query))*0.1)]
    
    #Matplotlib plot
    mpl.rcParams['figure.figsize'] = (10.0, 8.0)
    fig,ax = plt.subplots()
    lc1 = mc.LineCollection([main_line],linewidths=10,colors=["blue"])
    lc2 = mc.LineCollection(full_queries,linewidths=3,colors=["black"])
    lc3 = mc.LineCollection(aligned_queries,linewidths=3,colors=["green"])
    pad = mc.LineCollection([pad_left_down,pad_right_up],linewidths=3,colors=["white"])
    
    ax.add_collection(lc1)
    ax.add_collection(lc2)
    ax.add_collection(lc3)
    ax.add_collection(pad)

    #Add labels
    for lbl in labels:
        ax.text(lbl[0],lbl[1]+0.01,lbl[2])

    #Set title
    ax.autoscale()
    ax.set_title(sseqid)
    
    fig.savefig(out_file)
    plt.close()



In [18]:

    
for sseqid,df in blast_hits.groupby("sseqid"):
    plot_hits(df,sseqid,"{}.png".format(sseqid))

	qseqid	sseqid	pident	qcovs	score	qlen	qstart	qend	slen	sstart	send	length	mismatch
0	NODE_25_length_30926_cov_18.7711	NODE_109_length_25355_cov_5.19581_ID_86627	99.996	77	47026	30926	1	23516	25355	23516	1	23516	1
1	NODE_203_length_12813_cov_28.6907	NODE_683_length_8295_cov_9.51971_ID_110378	100.000	50	12886	12813	1	6443	8295	1853	8295	6443	0
2	NODE_353_length_9780_cov_17.0422	NODE_141_length_22510_cov_15.0782_ID_182954	100.000	100	19560	9780	1	9780	22510	19220	9441	9780	0
3	NODE_687_length_6958_cov_21.6469	NODE_156_length_21389_cov_10.4886_ID_87504	100.000	100	13916	6958	1	6958	21389	11354	4397	6958	0
4	NODE_1451_length_4109_cov_49.7083	NODE_156_length_21389_cov_10.4886_ID_87504	100.000	100	8218	4109	1	4109	21389	14257	18365	4109	0