In [1]:
import pandas as pd
import re

In [2]:
! mkdir -p out/

In [3]:
blast_hits = pd.DataFrame.from_csv("d9539_asm_v1.2_orf_filt_blastp.csv")
print(blast_hits.shape)
hmmer_hits = pd.DataFrame.from_csv("d9539_asm_v1.2_orf_filt_hmmsearch_dom.csv")
print(hmmer_hits.shape)


(200, 28)
(6, 23)

1. Analyze nr and hmp_pep hits


In [5]:
other_blastp_hits = blast_hits[ (blast_hits.db == "nr") | (blast_hits.db == "hmp_pep")]
other_blastp_hits = other_blastp_hits.drop(["tool","q_id","mism","q_cov_calc","s_scinames","s_names","s_gi","s_taxids"],1)
#hmp database seesm to have duplicated seqs. Remove duplicated hits
print("Duplicate hits")
print(other_blastp_hits[other_blastp_hits.duplicated()][["query_id","subject_id","db"]])
other_blastp_hits = other_blastp_hits.drop_duplicates()


Duplicate hits
              query_id        subject_id       db
25    d9539_asm_v1.2_5   HMPREF9136_2685  hmp_pep
175  d9539_asm_v1.2_29  HMPREF1059_03257  hmp_pep
186  d9539_asm_v1.2_29   HMPREF9141_0985  hmp_pep
192  d9539_asm_v1.2_29      BACEGG_02721  hmp_pep
194  d9539_asm_v1.2_29  HMPREF1018_02255  hmp_pep
196  d9539_asm_v1.2_29    BACCOPRO_03325  hmp_pep
221  d9539_asm_v1.2_22  HMPREF1059_03256  hmp_pep
238  d9539_asm_v1.2_22   HMPREF9141_0984  hmp_pep
240  d9539_asm_v1.2_22  HMPREF0991_01395  hmp_pep
294  d9539_asm_v1.2_11      BACEGG_02723  hmp_pep
301  d9539_asm_v1.2_11   HMPREF0645_1833  hmp_pep
306  d9539_asm_v1.2_11   HMPREF9141_0987  hmp_pep
309  d9539_asm_v1.2_11   HMPREF9145_2738  hmp_pep
311  d9539_asm_v1.2_11   HMPREF0860_1608  hmp_pep

2. Get blast hits for nr & hmp, getting the best hit for each ORF and any other hit within 5 bits of the best score for the ORF


In [6]:
other_blastp_hits_filt = other_blastp_hits[other_blastp_hits.e_value < 1]

filtered_hits = []
for _,hits in other_blastp_hits_filt.groupby(["query_id","db"]):
    top_bitscore_hit = hits.ix[hits["bitscore"].idxmax()]
    filtered_hits.append ( hits[hits["bitscore"] > top_bitscore_hit["bitscore"] - 5 ])
                                        

filtered_hits_df = pd.concat(filtered_hits)
print(filtered_hits_df.groupby(["query_id","db"])["subject_id"].count())
best_blast_hits = filtered_hits_df[["query_id","db","s_description","pct_id","q_cov","s_description","q_len","s_len"]]
best_blast_hits.to_csv("out/filt_orf_best_blast_hits.csv",index=False)
best_blast_hits


query_id           db     
d9539_asm_v1.2_11  hmp_pep    1
                   nr         2
d9539_asm_v1.2_22  hmp_pep    1
                   nr         1
d9539_asm_v1.2_29  hmp_pep    1
                   nr         2
d9539_asm_v1.2_30  nr         3
Name: subject_id, dtype: int64
Out[6]:
query_id db s_description pct_id q_cov s_description q_len s_len
293 d9539_asm_v1.2_11 hmp_pep BACEGG_02723 hypothetical protein [Bacteroides... 40.323 15 BACEGG_02723 hypothetical protein [Bacteroides... 421 368
292 d9539_asm_v1.2_11 nr hypothetical protein [Eel River basin pequenov... 27.404 48 hypothetical protein [Eel River basin pequenov... 421 277
295 d9539_asm_v1.2_11 nr hypothetical protein [Bacteroides eggerthii] 40.323 15 hypothetical protein [Bacteroides eggerthii] 421 368
220 d9539_asm_v1.2_22 hmp_pep HMPREF1059_03256 hypothetical protein [Parabac... 28.931 40 HMPREF1059_03256 hypothetical protein [Parabac... 361 284
215 d9539_asm_v1.2_22 nr putative replication initiation protein [Alist... 27.386 61 putative replication initiation protein [Alist... 361 320
174 d9539_asm_v1.2_29 hmp_pep HMPREF1059_03257 hypothetical protein [Parabac... 26.214 31 HMPREF1059_03257 hypothetical protein [Parabac... 647 538
167 d9539_asm_v1.2_29 nr putative major capsid protein [Eel River basin... 28.796 29 putative major capsid protein [Eel River basin... 647 523
168 d9539_asm_v1.2_29 nr putative uncharacterized protein [Alistipes fi... 31.746 27 putative uncharacterized protein [Alistipes fi... 647 338
134 d9539_asm_v1.2_30 nr hypothetical protein [Ruegeria sp. ANG-S4] 29.545 87 hypothetical protein [Ruegeria sp. ANG-S4] 97 308
135 d9539_asm_v1.2_30 nr OPT family small oligopeptide transporter, par... 37.313 64 OPT family small oligopeptide transporter, par... 97 725
136 d9539_asm_v1.2_30 nr cysteinyl-tRNA synthetase [Chlamydia trachomatis] 31.522 93 cysteinyl-tRNA synthetase [Chlamydia trachomatis] 97 531

2. Analyze env_nr hits


In [7]:
env_nr_hits = blast_hits[ (blast_hits.db == "env_nr") & (blast_hits.e_value < 1) ]
env_nr_hits["s_description"].apply(lambda x: x.split("[")[1]).drop_duplicates()


Out[7]:
169    marine metagenome]
Name: s_description, dtype: object

Considering all hits are from marine metagenomes in env_nr, just keep the one with highest bitscore


In [8]:
env_nr_hits_filt = pd.DataFrame(hits.ix[hits.bitscore.idxmax()] for _,hits in env_nr_hits.groupby("query_id")).sort_values(by="q_cds_start")
env_nr_hits_filt.drop(["q_start","q_end","s_start","s_end","q_id","s_names","s_scinames","tool","mism","gap_open"],axis=1)


Out[8]:
query_id subject_id pct_id ali_len e_value bitscore q_len s_len s_gi s_taxids q_cov s_description db orf_len q_cds_start q_cds_end strand q_cov_calc
312 d9539_asm_v1.2_11 gi|135497532|gb|EBI02878.1| 29.752 121 9.000000e-01 36.2 421 126 135497532 408172.0 29 hypothetical protein GOS_9161653, partial [mar... env_nr 421.0 393 1655 + 28.741093
272 d9539_asm_v1.2_14 gi|138220043|gb|EBY50077.1| 32.222 90 7.100000e-01 36.2 174 315 138220043 408172.0 51 hypothetical protein GOS_5591846, partial [mar... env_nr 174.0 1603 2124 + 51.149425
225 d9539_asm_v1.2_22 gi|142009119|gb|ECU79481.1| 25.000 252 3.000000e-07 57.8 361 295 142009119 408172.0 64 hypothetical protein GOS_10492, partial [marin... env_nr 361.0 2351 3433 + 63.711911
169 d9539_asm_v1.2_29 gi|142008696|gb|ECU79166.1| 24.706 255 6.490000e-08 62.0 647 505 142008696 408172.0 39 hypothetical protein GOS_10803 [marine metagen... env_nr 647.0 3429 5369 + 38.639876

3. Analyse MetaHIT hits


In [22]:
metahit_2014 = blast_hits[blast_hits.db == "metahit_2014_pep"]
metahit_2014 = metahit_2014.drop(["q_start","q_end","s_start","s_end","q_id","s_names","s_scinames","s_gi","s_taxids","tool","mism","gap_open"],axis=1)
print(metahit_2014.shape)


(81, 16)

a. Load MetaHIT 2014 annotation


In [13]:
metahit_2014_cols = ["gene_id","gene_name","gene_length","gene_completness",
                     "cohort_origin","phylum","genus","kegg","eggNOG",
                    "sample_freq","individual_freq","kegg_categories","eggnog_fx_categories","cohort_assembled"]

metahit_2014_annot = pd.read_csv("~/tmp/metahit_reanalysis/IGC.annotation_OF.summary",sep="\t",header=None,names=metahit_2014_cols)

print(metahit_2014_annot.shape)
metahit_2014_annot.head()


(9879896, 14)
Out[13]:
gene_id gene_name gene_length gene_completness cohort_origin phylum genus kegg eggNOG sample_freq individual_freq kegg_categories eggnog_fx_categories cohort_assembled
0 1 T2D-6A_GL0083352 88230 Complete CHN unknown unknown K01824 COG5184 0.224152 0.236449 Lipid Metabolism Cell cycle control, cell division, chromosome ... EUR;CHN;USA
1 2 V1.UC4-5_GL0154511 88086 Complete EUR unknown unknown K01824 COG5184 0.352802 0.351402 Lipid Metabolism Cell cycle control, cell division, chromosome ... EUR;CHN;USA
2 3 MH0198_GL0136826 67464 Lack 5'-end EUR unknown unknown K01824 COG5184 0.281768 0.293458 Lipid Metabolism Cell cycle control, cell division, chromosome ... EUR;CHN
3 4 V1.FI28_GL0106390 45267 Complete EUR unknown unknown K03924 NOG12793 0.278611 0.267290 Metabolism Function unknown EUR;USA
4 5 MH0318_GL0144550 42243 Complete EUR unknown unknown K13730 COG4886 0.001579 0.001869 Infectious Diseases Function unknown EUR

b. Annotate hits


In [14]:
#MetaHIT 2014
metahit_2014_df = pd.merge(metahit_2014, metahit_2014_annot, left_on="subject_id", right_on="gene_name",how="left")
print(metahit_2014_df.shape)
metahit_2014_df.head()


(81, 30)
Out[14]:
query_id subject_id pct_id ali_len e_value bitscore q_len s_len q_cov s_description ... cohort_origin phylum genus kegg eggNOG sample_freq individual_freq kegg_categories eggnog_fx_categories cohort_assembled
0 d9539_asm_v1.2_5 MH0409_GL0061975 91.379 58 1.220000e-31 113.0 58 86 100 MH0409_GL0061975 [gene] locus=scaffold117357... ... EUR unknown unknown unknown unknown 0.005525 0.005607 unknown unknown EUR
1 d9539_asm_v1.2_5 675950834-stool1_revised_C814893_1_gene71376 89.655 58 1.970000e-31 112.0 58 58 100 675950834-stool1_revised_C814893_1_gene71376 s... ... USA unknown unknown unknown unknown 0.018942 0.014953 unknown unknown USA
2 d9539_asm_v1.2_5 MH0105_GL0047475 89.655 58 2.880000e-31 112.0 58 86 100 MH0105_GL0047475 [gene] locus=scaffold1014_1... ... EUR unknown unknown unknown unknown 0.007103 0.008411 unknown unknown EUR
3 d9539_asm_v1.2_5 MH0203_GL0004312 89.655 58 3.250000e-31 112.0 58 86 100 MH0203_GL0004312 [gene] locus=C2075218_1:153... ... EUR unknown unknown unknown unknown 0.007893 0.009346 unknown unknown EUR
4 d9539_asm_v1.2_5 MH0033_GL0005948 89.655 58 4.350000e-31 112.0 58 86 100 MH0033_GL0005948 [gene] locus=scaffold47970_... ... EUR unknown unknown unknown unknown 0.011050 0.012150 unknown unknown EUR

5 rows × 30 columns

c. Tax annotation

No results


In [17]:
print(metahit_2014_df.shape)
print(metahit_2014_df[ metahit_2014_df.phylum != "unknown" ].shape)
print(metahit_2014_df[ metahit_2014_df.genus != "unknown" ].shape)


(81, 30)
(0, 30)
(0, 30)

d. KEGG


In [20]:
metahit_2014_df[metahit_2014_df.kegg != "unknown"][["query_id","db","kegg","kegg_categories"]].drop_duplicates()


Out[20]:
query_id db kegg kegg_categories
28 d9539_asm_v1.2_3 metahit_2014_pep K00936 Metabolism
29 d9539_asm_v1.2_3 metahit_2014_pep K02058 Membrane Transport

e. eggNOG


In [21]:
metahit_2014_df[metahit_2014_df.eggNOG != "unknown"][["query_id","db","eggNOG","eggnog_fx_categories"]].drop_duplicates()


Out[21]:
query_id db eggNOG eggnog_fx_categories
28 d9539_asm_v1.2_3 metahit_2014_pep COG0642 Signal transduction mechanisms
29 d9539_asm_v1.2_3 metahit_2014_pep COG1879 Carbohydrate transport and metabolism

f. Max observed freqs of ORFs 'homologues' in MetaHIT samples


In [23]:
gb = metahit_2014_df[["query_id","individual_freq"]].groupby("query_id")
max_individual_freq = pd.DataFrame([ df.ix[ df.individual_freq.idxmax() ] for _,df in gb ] )

gb = metahit_2014_df[["query_id","sample_freq"]].groupby("query_id")
max_sample_freq = pd.DataFrame([ df.ix[ df.sample_freq.idxmax() ] for _,df in gb ] )

pd.merge(max_individual_freq,max_sample_freq,on="query_id",how="outer")


Out[23]:
query_id individual_freq sample_freq
0 d9539_asm_v1.2_1 0.035514 0.037885
1 d9539_asm_v1.2_11 0.018692 0.022099
2 d9539_asm_v1.2_14 0.014953 0.019732
3 d9539_asm_v1.2_16 0.006542 0.007103
4 d9539_asm_v1.2_22 0.043925 0.052092
5 d9539_asm_v1.2_29 0.009346 0.011050
6 d9539_asm_v1.2_3 0.066355 0.063141
7 d9539_asm_v1.2_30 0.037383 0.037885
8 d9539_asm_v1.2_5 0.024299 0.024467

f. Write out best MetaHIT hit for each ORF


In [24]:
metahit_df = pd.DataFrame(hits.ix[hits.bitscore.idxmax()] for _,hits in metahit_hits.groupby("query_id")).sort_values(by="q_cds_start")
metahit_df[["query_id","pct_id","q_cov"]]
metahit_df[["query_id","pct_id","q_cov"]].to_csv("out/filt_orf_best_metahit_hits.csv",index=False)

In [ ]: