In [1]:
import pandas as pd
import re
from glob import glob
import requests
from bs4 import BeautifulSoup

Load blast hits


In [3]:
#Load blastn hits
blastn_hits = pd.read_csv("blastn_hits.csv")

1. Analyze blastn hits

1.1 Extract best env_nt hits to perform a genome MSA

The main goal is to perform an MSA between our D9539 assembly and the similar seqs in the dbs


In [4]:
#List of sequences to extract
seqs_for_msa = blastn_hits[blastn_hits.db == "env_nt"].sort_values(by="ali_len",ascending=False).head(n=10)
#Export megahit ids to extract directly from fasta : Empty!
#seqs_for_msa[seqs_for_msa.db == "hmp_nuc"]["subject_id"].to_csv("d9539_hmp_homologs.txt",sep="\t",index=False,header=False)

Obtain fastas for env_nt homologs from eutils


In [12]:
#Use efetch to extract and save to a file the fsta with the sequences
gis_to_get = ",".join(set(str(int(x)) for x in seqs_for_msa[seqs_for_msa.db == "env_nt"]["gi"]))
print(gis_to_get)
r = requests.get("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={}&rettype=fasta&retmode=text".format(gis_to_get))
with open("d9539_env_nt_homologs.fa","w") as env_nt_fh:
    env_nt_fh.write(str(r.content.decode()))


935523605,934409231,842490291,934040108,931667140,931877178,782230860

In [13]:
#Remove whitespace between seqs
!sed "/^$/d" d9539_env_nt_homologs.fa > d9539_env_nt_homologs.fasta
!rm d9539_env_nt_homologs.fa