In [1]:
!ls


File inspection.ipynb  PAXhs_vs_Pw-aa.blastp.txt  PAX.Xenopus.aa.fasta
pax7_start_hits.bed    PAXhs_vs_Pw.tblastn.txt	  SOX_vs_Pw.tblastn.txt
PAX.Hsapiens.aa.fasta  PAX_vs_Pw.tblastn.txt	  SOX.Xenopus.aa.fasta

In [2]:
%matplotlib inline

In [3]:
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import itertools
from matplotlib import collections as mc
import pylab
import numpy as np

1. Read Tblastn hits from the human PAX proteins to the salamander scaffolds


In [4]:
blast_cols = ["query_id","subject_id","pct_id","ali_len","mism","gap_open","q_start","q_end","s_start","s_end","e_value","bitscore"]
pax_hits = pd.read_csv("PAXhs_vs_Pw.tblastn.txt",sep="\t",header=None,names=blast_cols)
print( "Size of dataframe: {}".format(pax_hits.shape ))
pax_hits.head()


Size of dataframe: (404, 12)
Out[4]:
query_id subject_id pct_id ali_len mism gap_open q_start q_end s_start s_end e_value bitscore
0 PAX1_Hsapiens Pw_938150 87.85 214 21 4 96 306 1450 815 1.000000e-113 372.0
1 PAX1_Hsapiens Pw_1163203 74.70 83 21 0 92 174 3367 3119 1.000000e-27 127.0
2 PAX1_Hsapiens Pw_1163203 62.79 43 16 0 171 213 752 624 1.000000e-08 66.6
3 PAX1_Hsapiens Pw_2510707 70.67 75 22 0 141 215 490 266 1.000000e-21 107.0
4 PAX1_Hsapiens Pw_2510707 73.81 42 11 0 102 143 1354 1229 2.000000e-08 65.9

2. Calculate protein lenghts from human PAX proteins fasta


In [5]:
def get_seq_len_from_fasta(fasta_file):
    seqs = defaultdict(int)
    current_id = ""
    with open(fasta_file) as fh:
        for line in fh:
            if line.startswith(">"):
                current_id = line.lstrip(">").rstrip("\n")
            else:
                seqs[current_id] += len(line.rstrip("\n"))
    return seqs

pax_lengths = get_seq_len_from_fasta("PAX.Hsapiens.aa.fasta")

In [6]:
print(pax_lengths)


defaultdict(<class 'int'>, {'PAX3_Hsapiens': 479, 'PAX1_Hsapiens': 534, 'PAX2_Hsapiens': 417, 'PAX6_Hsapiens': 422, 'PAX4_Hsapiens': 350, 'PAX8_Hsapiens': 450, 'PAX9_Hsapiens': 341, 'PAX5_Hsapiens': 391, 'PAX7_Hsapiens': 505})

In [7]:
print(pax_hits.shape)
print(pax_hits[pax_hits.e_value < 1e-10].shape)


(404, 12)
(220, 12)

In [8]:
pax_hits.groupby("query_id")["subject_id"].size()


Out[8]:
query_id
PAX1_Hsapiens    27
PAX2_Hsapiens    35
PAX3_Hsapiens    80
PAX4_Hsapiens    28
PAX5_Hsapiens    37
PAX6_Hsapiens    58
PAX7_Hsapiens    81
PAX8_Hsapiens    30
PAX9_Hsapiens    28
dtype: int64

In [9]:
lol = pax_hits.groupby("query_id")["subject_id"].unique()
lol.apply(lambda x:len(x))


Out[9]:
query_id
PAX1_Hsapiens    21
PAX2_Hsapiens    28
PAX3_Hsapiens    72
PAX4_Hsapiens    26
PAX5_Hsapiens    28
PAX6_Hsapiens    48
PAX7_Hsapiens    72
PAX8_Hsapiens    22
PAX9_Hsapiens    22
Name: subject_id, dtype: int64

In [10]:
pylab.rcParams['figure.figsize'] = (10.0, 8.0)

gene_name = "PAX7_Hsapiens"
df = pax_hits[pax_hits.query_id == gene_name]
#Order contigs by query start
contig_idx = dict( (ctg_id, idx+1 ) for idx,ctg_id in enumerate(
            df[["subject_id","q_start"]].sort_values(by="q_start").drop_duplicates("subject_id",keep="first").subject_id 
                  ))
lines = [ [(q_st,contig_idx[s_id]),(q_end,contig_idx[s_id])] for s_id,q_st,q_end in 
df[["subject_id","q_start","q_end"]].sort_values(by="q_start").apply(lambda row: tuple(row),axis=1)
]
lc1 = mc.LineCollection(lines[0::3],linewidths=2,color="red")
lc2 = mc.LineCollection(lines[1::3],linewidths=2,color="blue")
lc3 = mc.LineCollection(lines[2::3],linewidths=2,color="green")
fig,ax = plt.subplots()
ax.add_collection(lc1)
ax.add_collection(lc2)
ax.add_collection(lc3)
ax.axvline(pax_lengths[gene_name])
ax.autoscale()


Hit extraction


In [26]:
!mkdir out/
#Filter by gene
df = pax_hits[pax_hits.query_id == "PAX7_Hsapiens"]
#Filter by gene region
df = df[(df.q_start > 180) & (df.q_start < 250)]
#Write to file
df[["subject_id","s_start","s_end"]].to_csv("out/pax7_middle_hits.bed",sep="\t",index=None,header=False)
#Print table
df.sort_values(by="q_start").head()


Out[26]:
query_id subject_id pct_id ali_len mism gap_open q_start q_end s_start s_end e_value bitscore
270 PAX7_Hsapiens Pw_1123873 78.21 78 17 0 186 263 4842 5075 7.000000e-28 127.0
303 PAX7_Hsapiens Pw_422920 53.75 80 29 3 188 263 1083 1310 4.000000e-12 77.8
304 PAX7_Hsapiens Pw_124418 53.75 80 29 3 188 263 29868 30095 4.000000e-12 77.8
333 PAX7_Hsapiens Pw_391938 35.63 87 56 0 189 275 4056 3796 7.000000e-07 60.8
321 PAX7_Hsapiens Pw_2083840 59.02 61 22 1 189 249 3388 3561 1.000000e-08 66.6
334 PAX7_Hsapiens Pw_68547 35.63 87 56 0 189 275 4871 5131 8.000000e-07 60.8
298 PAX7_Hsapiens Pw_1176810 44.44 117 57 2 192 303 725 1066 4.000000e-14 82.8
342 PAX7_Hsapiens Pw_6015308 41.03 78 43 2 192 269 453 677 6.000000e-06 56.6
338 PAX7_Hsapiens Pw_2848915 32.61 92 60 1 194 283 217 492 4.000000e-06 58.2
316 PAX7_Hsapiens Pw_784005 44.44 81 38 1 195 275 1266 1045 4.000000e-09 68.2
268 PAX7_Hsapiens Pw_977326 83.33 84 7 1 196 272 1290 1039 6.000000e-32 139.0
308 PAX7_Hsapiens Pw_2885311 56.92 65 28 0 199 263 3001 3195 1.000000e-11 76.3
317 PAX7_Hsapiens Pw_1702257 58.46 65 17 2 199 263 5993 5829 4.000000e-09 68.2
307 PAX7_Hsapiens Pw_3292658 63.08 65 23 1 200 263 535 341 8.000000e-12 76.6
343 PAX7_Hsapiens Pw_1940827 38.16 76 46 1 201 275 2946 2719 7.000000e-06 57.8
339 PAX7_Hsapiens Pw_1881471 34.52 84 53 1 202 283 1865 1614 5.000000e-06 58.2
329 PAX7_Hsapiens Pw_1180651 53.23 62 29 0 204 265 4265 4080 4.000000e-07 62.0
310 PAX7_Hsapiens Pw_2081599 70.00 50 15 0 214 263 8839 8690 3.000000e-10 72.0
302 PAX7_Hsapiens Pw_2513685 65.62 64 22 0 214 277 2294 2103 1.000000e-12 79.7
300 PAX7_Hsapiens Pw_983623 74.00 50 13 0 214 263 4157 4306 3.000000e-13 81.6
299 PAX7_Hsapiens Pw_4659549 76.00 50 12 0 214 263 1124 1273 1.000000e-13 81.6
324 PAX7_Hsapiens Pw_1256622 62.00 50 19 0 214 263 3838 3987 9.000000e-08 63.9
336 PAX7_Hsapiens Pw_34456 61.22 49 19 0 215 263 12951 13097 1.000000e-06 60.8
335 PAX7_Hsapiens Pw_2273486 61.22 49 19 0 215 263 11698 11552 9.000000e-07 60.8
332 PAX7_Hsapiens Pw_1809758 57.14 49 21 0 215 263 282 136 6.000000e-07 61.2
344 PAX7_Hsapiens Pw_814160 43.33 60 34 0 215 274 1071 1250 7.000000e-06 55.1
325 PAX7_Hsapiens Pw_1613384 61.22 49 19 0 215 263 2459 2605 1.000000e-07 63.5
323 PAX7_Hsapiens Pw_966742 60.38 53 20 1 215 266 339 181 2.000000e-08 63.9
311 PAX7_Hsapiens Pw_1425627 71.43 49 14 0 215 263 10523 10669 3.000000e-10 72.0
320 PAX7_Hsapiens Pw_5071985 70.83 48 14 0 216 263 1534 1391 1.000000e-08 66.2
318 PAX7_Hsapiens Pw_4062193 64.58 48 17 0 216 263 3343 3200 5.000000e-09 67.8
315 PAX7_Hsapiens Pw_1149043 46.88 64 34 0 216 279 2051 2242 3.000000e-09 68.6
313 PAX7_Hsapiens Pw_2519037 46.88 64 34 0 216 279 854 663 2.000000e-09 68.9
331 PAX7_Hsapiens Pw_3373424 63.83 47 17 0 217 263 935 795 6.000000e-07 60.8
341 PAX7_Hsapiens Pw_4884012 40.79 76 42 2 218 292 1460 1239 5.000000e-06 57.8
322 PAX7_Hsapiens Pw_3343250 65.22 46 16 0 218 263 587 450 2.000000e-08 65.5
337 PAX7_Hsapiens Pw_284865 65.12 43 15 0 221 263 9915 9787 1.000000e-06 60.5
327 PAX7_Hsapiens Pw_156739 67.44 43 14 0 221 263 1400 1528 2.000000e-07 62.8
326 PAX7_Hsapiens Pw_1836946 66.67 42 14 0 222 263 4526 4651 2.000000e-07 63.2
319 PAX7_Hsapiens Pw_416205 64.44 45 16 0 222 266 4610 4744 6.000000e-09 67.8
330 PAX7_Hsapiens Pw_2534669 67.50 40 13 0 224 263 3665 3546 5.000000e-07 61.6

Greedy strategy to extract longest contigs for each gene


In [12]:
def calculate_cov(contig_set,gene_length):
    bitmap = np.zeros(gene_length)
    for hit in contig_set:
        bitmap[hit[1]-1:hit[2]] = 1
    return bitmap.sum()

In [14]:
gene_name = "PAX6_Hsapiens"
df = pax_hits[pax_hits.query_id == gene_name]

#Extract lists of hits for protein
hits_list = df[["subject_id","q_start","q_end"]].apply(lambda row:tuple(row),axis=1).tolist()

#Greedy Algorithm starts here
contig_set = tuple()
last_it_cvg= 0
it_cvg_delta = 11
min_cov_increase = 10

while hits_list and (it_cvg_delta > min_cov_increase):
    it_best_hit = tuple()
    it_best_cvg = 0
    #Try which combination of current contig_set + a hit gives the largest coverage increase
    for hit in hits_list:
        cvg = calculate_cov( contig_set + (hit,) , pax_lengths[gene_name])
        if cvg > it_best_cvg:
            it_best_hit = hit
            it_best_cvg = cvg
        
    print("Max_it_cvg : {}".format(it_best_cvg))
    #Calculate coverage increase
    it_cvg_delta = it_best_cvg - last_it_cvg
    #IF delta if better than the minimun, add the hit
    if  it_cvg_delta > min_cov_increase:
        last_it_cvg = it_best_cvg
        contig_set = contig_set + (it_best_hit,)
        hits_list.remove(hit)

print(contig_set)


Max_it_cvg : 138.0
Max_it_cvg : 206.0
Max_it_cvg : 255.0
Max_it_cvg : 299.0
Max_it_cvg : 336.0
Max_it_cvg : 370.0
Max_it_cvg : 373.0
(('Pw_938150', 2, 139), ('Pw_2885311', 189, 256), ('Pw_577196', 256, 305), ('Pw_3748617', 343, 386), ('Pw_577196', 306, 344), ('Pw_3948717', 120, 173))