In [2]:
import pandas as pd
In [3]:
import requests
import time
In [45]:
def ena_collections():
    # The collections endpoint returns one tab-separated record per line,
    # terminated by '\r\n' (hence dropping the trailing empty element).
    resp = requests.get("http://www.ebi.ac.uk/ena/search/showQueryCollections",
                        params={'type': 'sensitive'})
    available_collections = {}
    for line in resp.text.split('\r\n')[:-1]:
        collection_id, collection_name, display_name, collection = line.split('\t')
        available_collections[int(collection_id)] = {
            'collection_id': int(collection_id),
            'collection_name': collection_name,
            'display_name': display_name,
            'collection': collection,
        }
    return available_collections
def search_ena(seq):
    # Submit the search job; the response body is the status URL to poll,
    # with the job id at the end of its query string.
    resp = requests.get("http://www.ebi.ac.uk/ena/search/executeSearch",
                        params={'Sequence': seq,
                                'type': 'sensitive'})
    status_url = resp.text.strip()
    jobid = status_url.split('job_id=')[-1]
    status_fields = ('status', 'space', 'servers completed',
                     'total servers', 'alignments available')
    resp_s = requests.get(status_url, cookies=resp.cookies)
    search_response = dict(zip(status_fields, resp_s.text.strip().split('\t')))
    # Poll every five seconds until the job leaves the SEARCHING state.
    while search_response['status'] == 'SEARCHING':
        # print(search_response['servers completed'], search_response['total servers'])
        time.sleep(5)
        resp_s = requests.get(status_url, cookies=resp.cookies)
        search_response = dict(zip(status_fields, resp_s.text.strip().split('\t')))
    # Fetch the finished job's alignments as tab-separated rows.
    resp_r = requests.get('http://www.ebi.ac.uk/ena/search/searchResults',
                          params={'job_id': jobid,
                                  'fields': 'accession,data_source,description,e_value,identity,target_length,organism'},
                          cookies=resp.cookies)
    alignments_dict = {}
    for line in resp_r.text.split('\r\n')[:-1]:
        data = line.split('\t')
        alignments_dict[data[0]] = {'accession': data[0],
                                    'data_source': data[1],
                                    'description': data[2],
                                    'e_value': float(data[3]),
                                    'identity': int(data[4]),
                                    'target_length': int(data[5]),
                                    'organism': data[6]}
    alignments = pd.DataFrame.from_dict(alignments_dict).T
    # Keep hits no longer than the query (len(seq), not the global ena_seq)
    # and rank by target length, then percent identity.
    sorted_alignments = (alignments[alignments.target_length <= len(seq)]
                         .sort_values(by=['target_length', 'identity'], ascending=False))
    return sorted_alignments
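Nothing below actually calls ena_collections; it is there to confirm the service is reachable and to see which collections a 'sensitive' search runs against. A quick look at what it returns (a sketch; the IDs and display names are whatever the service reports, nothing is hard-coded here):
In [ ]:
for cid, info in sorted(ena_collections().items()):
    print(cid, info['display_name'])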
In [46]:
ena_seq = 'TATTCAATAAGTCAATATCATGCCGTTAATATGTTGCCATCCGTGGCAATCATGCTGCTAACGTGTGACCGCATTCAAAATGTTGTCTGCGATTGACTCTTCTTTGTGGCATTGCACCACCAGAGCGTCATACAGCGGCTTAACAGTGCGTGACCAGGTGGGTTGGGTAAGGTTTGGGGTTAGCATCGTCACAGCGCGATATGCTGCGCTTGCTGGCATCCTTGAATAGCCGACGCCTTTGCATCTTCCGCACTCTTTCTCGACAACTCTCCCCCACAGCTCTGTTTTGGCAATATCAACCGCACGGCCTGTACCATGACAATCTCTGCATCTTGCCCCCGGCGTCGCGGCACTACGGCAATAATCCGCATAAGCGAATGTTGCGAGCACTTGCAGTACCTTTGCCTTAGTATTTCCTTCAAGCTTTGCCACACCACGGTATTTCCCCGATACCTTGTGTGCAAATTGCATCAGATAGTTGATAGCCTTTTGTTTGTCGTTCTGGCTGAGTTCGTGCTTACCGCAGAATGCAGCCATACCGAATCCG'
In [47]:
sorted_alignments = search_ena(ena_seq)
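search_ena blocks until the job leaves the SEARCHING state, which for a sensitive search across all collections can take a while, and the loop as written will poll forever if the job stalls. A minimal guard with a deadline (a sketch; poll_until_done is a name made up here, not part of the ENA API):
In [ ]:
def poll_until_done(status_url, cookies, interval=5, timeout=600):
    # Hypothetical helper: poll the status URL returned by executeSearch,
    # giving up after `timeout` seconds instead of looping indefinitely.
    deadline = time.time() + timeout
    while time.time() < deadline:
        fields = requests.get(status_url, cookies=cookies).text.strip().split('\t')
        if fields[0] != 'SEARCHING':
            return fields
        time.sleep(interval)
    raise RuntimeError("ENA search still running after %s seconds" % timeout)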
In [48]:
(sorted_alignments
 .organism
 .head(100)
 .apply(lambda x: " ".join(x.split()[:2]))
 .value_counts())
Out[48]:
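The chain above takes the organism names of the first 100 hits (longest targets first), trims each to its first two words (genus and species), and tallies them. The same idea on a toy Series (synthetic names, just to show the shape of the result):
In [ ]:
toy = pd.Series(['Escherichia coli str. K-12',
                 'Escherichia coli O157:H7',
                 'Salmonella enterica subsp. enterica'])
toy.apply(lambda x: " ".join(x.split()[:2])).value_counts()
# Escherichia coli       2
# Salmonella enterica    1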