In [1]:
!ls
In [2]:
%matplotlib inline
In [3]:
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import itertools
from matplotlib import collections as mc
import pylab
import numpy as np
In [4]:
blast_cols = ["query_id","subject_id","pct_id","ali_len","mism","gap_open","q_start","q_end","s_start","s_end","e_value","bitscore"]
pax_hits = pd.read_csv("PAXhs_vs_Pw.tblastn.txt",sep="\t",header=None,names=blast_cols)
print( "Size of dataframe: {}".format(pax_hits.shape ))
pax_hits.head()
Out[4]:
In [5]:
def get_seq_len_from_fasta(fasta_file):
seqs = defaultdict(int)
current_id = ""
with open(fasta_file) as fh:
for line in fh:
if line.startswith(">"):
current_id = line.lstrip(">").rstrip("\n")
else:
seqs[current_id] += len(line.rstrip("\n"))
return seqs
pax_lengths = get_seq_len_from_fasta("PAX.Hsapiens.aa.fasta")
In [6]:
print(pax_lengths)
In [7]:
print(pax_hits.shape)
print(pax_hits[pax_hits.e_value < 1e-10].shape)
In [8]:
pax_hits.groupby("query_id")["subject_id"].size()
Out[8]:
In [9]:
lol = pax_hits.groupby("query_id")["subject_id"].unique()
lol.apply(lambda x:len(x))
Out[9]:
In [10]:
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
gene_name = "PAX7_Hsapiens"
df = pax_hits[pax_hits.query_id == gene_name]
#Order contigs by query start
contig_idx = dict( (ctg_id, idx+1 ) for idx,ctg_id in enumerate(
df[["subject_id","q_start"]].sort_values(by="q_start").drop_duplicates("subject_id",keep="first").subject_id
))
lines = [ [(q_st,contig_idx[s_id]),(q_end,contig_idx[s_id])] for s_id,q_st,q_end in
df[["subject_id","q_start","q_end"]].sort_values(by="q_start").apply(lambda row: tuple(row),axis=1)
]
lc1 = mc.LineCollection(lines[0::3],linewidths=2,color="red")
lc2 = mc.LineCollection(lines[1::3],linewidths=2,color="blue")
lc3 = mc.LineCollection(lines[2::3],linewidths=2,color="green")
fig,ax = plt.subplots()
ax.add_collection(lc1)
ax.add_collection(lc2)
ax.add_collection(lc3)
ax.axvline(pax_lengths[gene_name])
ax.autoscale()
In [26]:
!mkdir out/
#Filter by gene
df = pax_hits[pax_hits.query_id == "PAX7_Hsapiens"]
#Filter by gene region
df = df[(df.q_start > 180) & (df.q_start < 250)]
#Write to file
df[["subject_id","s_start","s_end"]].to_csv("out/pax7_middle_hits.bed",sep="\t",index=None,header=False)
#Print table
df.sort_values(by="q_start").head()
Out[26]:
In [12]:
def calculate_cov(contig_set,gene_length):
bitmap = np.zeros(gene_length)
for hit in contig_set:
bitmap[hit[1]-1:hit[2]] = 1
return bitmap.sum()
In [14]:
gene_name = "PAX6_Hsapiens"
df = pax_hits[pax_hits.query_id == gene_name]
#Extract lists of hits for protein
hits_list = df[["subject_id","q_start","q_end"]].apply(lambda row:tuple(row),axis=1).tolist()
#Greedy Algorithm starts here
contig_set = tuple()
last_it_cvg= 0
it_cvg_delta = 11
min_cov_increase = 10
while hits_list and (it_cvg_delta > min_cov_increase):
it_best_hit = tuple()
it_best_cvg = 0
#Try which combination of current contig_set + a hit gives the largest coverage increase
for hit in hits_list:
cvg = calculate_cov( contig_set + (hit,) , pax_lengths[gene_name])
if cvg > it_best_cvg:
it_best_hit = hit
it_best_cvg = cvg
print("Max_it_cvg : {}".format(it_best_cvg))
#Calculate coverage increase
it_cvg_delta = it_best_cvg - last_it_cvg
#IF delta if better than the minimun, add the hit
if it_cvg_delta > min_cov_increase:
last_it_cvg = it_best_cvg
contig_set = contig_set + (it_best_hit,)
hits_list.remove(hit)
print(contig_set)