In [1]:
# Shell escape: list the contents of the current working directory.
!ls
In [2]:
import pandas as pd
import re
from glob import glob
import requests
from bs4 import BeautifulSoup
In [5]:
# Pattern for BLAST result file names of the form "<tool>_vs_<db>.tsv".
#   group 1 -> tool: "blastn" or "blastp"
#   group 2 -> database identifier (letters, digits, underscores)
# The dot is escaped so only a literal ".tsv" extension matches, and "$" anchors
# the extension to the end of the name.
blast_file_regex = re.compile(r"(blast[np])_vs_([a-zA-Z0-9_]+)\.tsv$")

# Column names applied to the header-less tabular BLAST output, in file order.
blast_cols = [
    "query_id", "subject_id", "pct_id", "ali_len", "mism",
    "gap_open", "q_start", "q_end", "s_start", "s_end",
    "e_value", "bitscore", "q_len", "s_len", "s_gi",
    "s_taxids", "s_scinames", "s_names", "q_cov", "s_description",
]
# Field specifiers in the same order as blast_cols (presumably the list passed to
# BLAST's tabular -outfmt when the files were produced -- confirm against the
# command that generated them):
#   qseqid sseqid pident length mismatch gapopen qstart qend sstart send
#   evalue bitscore qlen slen sgi staxids sscinames scomnames qcovs stitle
In [6]:
# Read every tab-separated BLAST result under 2_blast/ into its own DataFrame and
# tag each one with the tool and database parsed from its file name.
blast_hits = []
# sorted(): glob returns paths in arbitrary order; sorting keeps runs reproducible.
for blast_filename in sorted(glob("2_blast/*.tsv")):
    name_match = blast_file_regex.search(blast_filename)
    if name_match is None:
        # Fail with the offending path instead of an AttributeError on None.
        raise ValueError(
            f"unexpected BLAST result file name (no tool/db found): {blast_filename!r}"
        )
    tool_id, db_id = name_match.groups()

    # The files carry no header row, so column names come from blast_cols.
    file_hits = pd.read_csv(blast_filename, sep="\t", header=None, names=blast_cols)
    file_hits["tool"] = tool_id
    file_hits["db"] = db_id
    blast_hits.append(file_hits)
In [7]:
# Stack the per-file hit tables into a single DataFrame (each frame keeps its own
# row index, as before).
# pd.concat is used instead of repeated DataFrame.append: append was removed in
# pandas 2.0, and appending in a loop re-copies the accumulated rows every time.
# With no input frames, pd.concat raises ValueError.
all_blast_hits = pd.concat(blast_hits)
print(all_blast_hits.shape)
In [8]:
# Preview the first five combined hits.
all_blast_hits.head(n=5)
Out[8]:
In [10]:
# Order the hits by query_id and then bitscore, both descending, and write them
# to CSV without the row index.
(
    all_blast_hits
    .sort_values(by=["query_id", "bitscore"], ascending=[False, False])
    .to_csv("2_blastp_hits.csv", index=False)
)