In [1]:
import pandas as pd
import re
In [2]:
# Create the output directory; os.makedirs is portable (the original
# `! mkdir -p out/` shell escape fails on Windows and non-POSIX shells).
import os
os.makedirs("out", exist_ok=True)
In [3]:
# Load the pre-computed BLASTP and HMMER domain hit tables.
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# pd.read_csv with index_col=0 reproduces its default behavior of using
# the first column as the index.
blast_hits = pd.read_csv("d9539_asm_v1.2_orf_filt_blastp.csv", index_col=0)
print(blast_hits.shape)
hmmer_hits = pd.read_csv("d9539_asm_v1.2_orf_filt_hmmsearch_dom.csv", index_col=0)
print(hmmer_hits.shape)
In [5]:
# Keep only hits against the nr and hmp_pep databases and discard
# bookkeeping columns not needed downstream.
other_blastp_hits = blast_hits[blast_hits.db.isin(["nr", "hmp_pep"])]
# Name the axis explicitly: positional `axis` in drop() is deprecated.
other_blastp_hits = other_blastp_hits.drop(
    columns=["tool", "q_id", "mism", "q_cov_calc",
             "s_scinames", "s_names", "s_gi", "s_taxids"]
)
# The hmp database seems to have duplicated seqs; remove duplicated hits.
print("Duplicate hits")
print(other_blastp_hits[other_blastp_hits.duplicated()][["query_id", "subject_id", "db"]])
other_blastp_hits = other_blastp_hits.drop_duplicates()
In [6]:
# For each (query, db) pair, keep every hit whose bitscore is within 5 bits
# of that pair's best hit, then export the retained hits.
other_blastp_hits_filt = other_blastp_hits[other_blastp_hits.e_value < 1]
filtered_hits = []
for _, hits in other_blastp_hits_filt.groupby(["query_id", "db"]):
    # .ix was removed in pandas 1.0 — idxmax() returns a label, so .loc applies.
    top_bitscore_hit = hits.loc[hits["bitscore"].idxmax()]
    filtered_hits.append(hits[hits["bitscore"] > top_bitscore_hit["bitscore"] - 5])
filtered_hits_df = pd.concat(filtered_hits)
print(filtered_hits_df.groupby(["query_id", "db"])["subject_id"].count())
# Note: the original column list repeated "s_description" twice — an
# accidental duplicate, dropped here so the CSV has each column once.
best_blast_hits = filtered_hits_df[
    ["query_id", "db", "s_description", "pct_id", "q_cov", "q_len", "s_len"]
]
best_blast_hits.to_csv("out/filt_orf_best_blast_hits.csv", index=False)
best_blast_hits
Out[6]:
In [7]:
# env_nr hits with a permissive e-value cutoff; list the distinct bracketed
# source annotations from the description field.
env_nr_hits = blast_hits[(blast_hits.db == "env_nr") & (blast_hits.e_value < 1)]
# Guard the split: the original split("[")[1] raises IndexError for any
# description lacking a "[source]" suffix; fall back to the raw string.
env_nr_hits["s_description"].apply(
    lambda x: x.split("[")[1] if "[" in x else x
).drop_duplicates()
Out[7]:
Since all env_nr hits come from marine metagenomes, keep only the single hit with the highest bitscore per query.
In [8]:
# Keep only the top-bitscore env_nr hit per query, ordered by CDS position.
# .ix was removed in pandas 1.0 — idxmax() gives a label, so use .loc.
env_nr_hits_filt = pd.DataFrame(
    hits.loc[hits.bitscore.idxmax()] for _, hits in env_nr_hits.groupby("query_id")
).sort_values(by="q_cds_start")
# Display without bookkeeping columns (drop is not in place; the full frame
# env_nr_hits_filt is unchanged).
env_nr_hits_filt.drop(
    ["q_start", "q_end", "s_start", "s_end", "q_id", "s_names",
     "s_scinames", "tool", "mism", "gap_open"],
    axis=1,
)
Out[8]:
In [22]:
# Subset the MetaHIT 2014 peptide hits and strip alignment-coordinate and
# bookkeeping columns not needed for the annotation join below.
unneeded_cols = ["q_start", "q_end", "s_start", "s_end", "q_id", "s_names",
                 "s_scinames", "s_gi", "s_taxids", "tool", "mism", "gap_open"]
metahit_2014 = blast_hits[blast_hits.db == "metahit_2014_pep"].drop(unneeded_cols, axis=1)
print(metahit_2014.shape)
In [13]:
# Column names for the headerless IGC annotation summary table.
# NOTE(review): path is a hardcoded home-relative location — breaks on other
# machines; consider a DATA_DIR constant.
metahit_2014_cols = [
    "gene_id", "gene_name", "gene_length", "gene_completness",
    "cohort_origin", "phylum", "genus", "kegg", "eggNOG",
    "sample_freq", "individual_freq", "kegg_categories",
    "eggnog_fx_categories", "cohort_assembled",
]
metahit_2014_annot = pd.read_csv(
    "~/tmp/metahit_reanalysis/IGC.annotation_OF.summary",
    sep="\t",
    header=None,
    names=metahit_2014_cols,
)
print(metahit_2014_annot.shape)
metahit_2014_annot.head()
Out[13]:
In [14]:
#MetaHIT 2014
# MetaHIT 2014: attach IGC annotations to each hit. A left join keeps hits
# whose subject has no annotation row (annotation columns become NaN).
metahit_2014_df = pd.merge(
    metahit_2014,
    metahit_2014_annot,
    left_on="subject_id",
    right_on="gene_name",
    how="left",
)
print(metahit_2014_df.shape)
metahit_2014_df.head()
Out[14]:
In [17]:
# How many annotated rows carry phylum / genus assignments?
print(metahit_2014_df.shape)
print(metahit_2014_df.loc[metahit_2014_df.phylum != "unknown"].shape)
print(metahit_2014_df.loc[metahit_2014_df.genus != "unknown"].shape)
In [20]:
# Distinct KEGG annotations per query ORF (rows with no KEGG assignment excluded).
has_kegg = metahit_2014_df.kegg != "unknown"
metahit_2014_df.loc[has_kegg, ["query_id", "db", "kegg", "kegg_categories"]].drop_duplicates()
Out[20]:
In [21]:
# Distinct eggNOG annotations per query ORF (unannotated rows excluded).
has_eggnog = metahit_2014_df.eggNOG != "unknown"
metahit_2014_df.loc[has_eggnog, ["query_id", "db", "eggNOG", "eggnog_fx_categories"]].drop_duplicates()
Out[21]:
In [23]:
# For each query ORF, pick the hit rows with the maximum individual_freq and
# sample_freq, then merge the two summaries on query_id.
# .ix was removed in pandas 1.0 — idxmax() yields a label, so .loc applies.
gb = metahit_2014_df[["query_id", "individual_freq"]].groupby("query_id")
max_individual_freq = pd.DataFrame([df.loc[df.individual_freq.idxmax()] for _, df in gb])
gb = metahit_2014_df[["query_id", "sample_freq"]].groupby("query_id")
max_sample_freq = pd.DataFrame([df.loc[df.sample_freq.idxmax()] for _, df in gb])
pd.merge(max_individual_freq, max_sample_freq, on="query_id", how="outer")
Out[23]:
In [24]:
# BUG: `metahit_hits` is never defined anywhere in this notebook, so this cell
# raised a NameError on a fresh run. The merged MetaHIT table appears to be the
# intended input — using metahit_2014_df here (TODO confirm against original
# analysis). Also: .ix was removed in pandas 1.0 — use .loc.
metahit_df = pd.DataFrame(
    hits.loc[hits.bitscore.idxmax()] for _, hits in metahit_2014_df.groupby("query_id")
).sort_values(by="q_cds_start")
metahit_df[["query_id", "pct_id", "q_cov"]].to_csv("out/filt_orf_best_metahit_hits.csv", index=False)
# Display the exported columns as the cell output.
metahit_df[["query_id", "pct_id", "q_cov"]]
In [ ]: