In [2]:
import pandas as pd

In [3]:
import requests
import time

In [45]:
def ena_collections():
    """Fetch the collections that the ENA "sensitive" sequence search can query.

    Sends a GET request to the ENA ``showQueryCollections`` endpoint and parses
    the tab-separated reply, which is expected to hold one collection per line
    with exactly four fields: collection id, collection name, display name and
    collection.

    Returns:
        dict: maps each integer collection id to a dict with the keys
        ``'collection_id'`` (int), ``'collection_name'``, ``'display_name'``
        and ``'collection'`` (all strings).

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
        ValueError: if a non-blank line does not have exactly four fields or
            its first field is not an integer.
    """
    resp = requests.get("http://www.ebi.ac.uk/ena/search/showQueryCollections", params={'type': 'sensitive'})
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    resp.raise_for_status()

    available_collections = {}
    # Use the decoded body (resp.text): resp.content is bytes on Python 3 and
    # cannot be split with a str separator.
    for raw_line in resp.text.split('\n'):
        line = raw_line.rstrip('\r')
        # Skip blank lines rather than blindly dropping the last element, so a
        # body without a trailing line break does not lose its final record.
        if not line.strip():
            continue
        collection_id, collection_name, display_name, collection = line.split('\t')
        available_collections[int(collection_id)] = {
            'collection_id': int(collection_id),
            'collection_name': collection_name,
            'display_name': display_name,
            'collection': collection,
        }
    return available_collections

# Field names of the tab-separated status line returned while a search runs.
_ENA_STATUS_FIELDS = ('status', 'space', 'servers completed', 'total servers', 'alignments available')

# Result columns requested from the searchResults endpoint, in reply order.
_ENA_RESULT_FIELDS = ('accession', 'data_source', 'description', 'e_value',
                      'identity', 'target_length', 'organism')


def _ena_search_status(status_url, cookies):
    """Fetch the job status line once and return it as a dict keyed by _ENA_STATUS_FIELDS."""
    resp_s = requests.get(status_url, cookies=cookies)
    resp_s.raise_for_status()
    return dict(zip(_ENA_STATUS_FIELDS, resp_s.text.strip().split('\t')))


def _parse_ena_alignments(text):
    """Turn the tab-separated searchResults body into a DataFrame indexed by accession."""
    alignments_dict = {}
    for raw_line in text.split('\n'):
        line = raw_line.rstrip('\r')
        if not line.strip():
            continue
        data = line.split('\t')
        # Keyed by accession, so a repeated accession keeps only its last row.
        alignments_dict[data[0]] = {'accession': data[0],
                                    'data_source': data[1],
                                    'description': data[2],
                                    'e_value': float(data[3]),
                                    'identity': int(data[4]),
                                    'target_length': int(data[5]),
                                    'organism': data[6]}
    if not alignments_dict:
        # No hits: still return the expected columns so callers can filter/sort.
        return pd.DataFrame(columns=list(_ENA_RESULT_FIELDS))
    # orient='index' makes one row per accession and keeps numeric dtypes
    # (the transpose of a column-oriented frame would make every column object).
    return pd.DataFrame.from_dict(alignments_dict, orient='index')


def search_ena(seq):
    """Run an ENA "sensitive" sequence search for ``seq`` and return ranked hits.

    Submits the sequence to the ``executeSearch`` endpoint, polls the status
    URL it returns every 5 seconds while the status is ``'SEARCHING'``, then
    downloads the results and keeps the hits whose target length does not
    exceed the length of ``seq``.

    Args:
        seq (str): the query sequence.

    Returns:
        pandas.DataFrame: one row per accession (the index) with the columns
        ``accession``, ``data_source``, ``description``, ``e_value``,
        ``identity``, ``target_length`` and ``organism``, sorted by
        ``target_length`` and then ``identity``, both descending. Empty (but
        with those columns) when the search yields no hits.

    Raises:
        requests.HTTPError: if any request answers with a 4xx/5xx status.
        ValueError / IndexError: if a result line is malformed.
    """
    resp = requests.get("http://www.ebi.ac.uk/ena/search/executeSearch",
                        params={'Sequence': seq,
                                'type': 'sensitive'})
    resp.raise_for_status()
    # The reply body is the status URL; the job id is its trailing job_id= value.
    status_url = resp.text.strip()
    jobid = status_url.split('job_id=')[-1]

    # The cookies of the submission are sent with every follow-up request.
    search_response = _ena_search_status(status_url, resp.cookies)
    while search_response['status'] == 'SEARCHING':
        time.sleep(5)
        search_response = _ena_search_status(status_url, resp.cookies)

    resp_r = requests.get('http://www.ebi.ac.uk/ena/search/searchResults',
                          params={'job_id': jobid,
                                  'fields': ','.join(_ENA_RESULT_FIELDS)},
                          cookies=resp.cookies)
    resp_r.raise_for_status()

    alignments = _parse_ena_alignments(resp_r.text)
    # Filter against the sequence that was actually searched (the parameter),
    # not whatever global happens to be defined in the notebook.
    not_longer_than_query = alignments[alignments.target_length <= len(seq)]
    sorted_alignments = not_longer_than_query.sort_values(by=['target_length', 'identity'], ascending=False)

    return sorted_alignments

In [46]:
# DNA query sequence (A/C/G/T string) that is passed to search_ena().
ena_seq = 'TATTCAATAAGTCAATATCATGCCGTTAATATGTTGCCATCCGTGGCAATCATGCTGCTAACGTGTGACCGCATTCAAAATGTTGTCTGCGATTGACTCTTCTTTGTGGCATTGCACCACCAGAGCGTCATACAGCGGCTTAACAGTGCGTGACCAGGTGGGTTGGGTAAGGTTTGGGGTTAGCATCGTCACAGCGCGATATGCTGCGCTTGCTGGCATCCTTGAATAGCCGACGCCTTTGCATCTTCCGCACTCTTTCTCGACAACTCTCCCCCACAGCTCTGTTTTGGCAATATCAACCGCACGGCCTGTACCATGACAATCTCTGCATCTTGCCCCCGGCGTCGCGGCACTACGGCAATAATCCGCATAAGCGAATGTTGCGAGCACTTGCAGTACCTTTGCCTTAGTATTTCCTTCAAGCTTTGCCACACCACGGTATTTCCCCGATACCTTGTGTGCAAATTGCATCAGATAGTTGATAGCCTTTTGTTTGTCGTTCTGGCTGAGTTCGTGCTTACCGCAGAATGCAGCCATACCGAATCCG'

In [47]:
# Run the ENA search for the query sequence. This performs network requests and
# blocks until the search is finished (search_ena polls in 5-second steps).
sorted_alignments = search_ena(ena_seq)

In [48]:
# Tally the organisms of the first 100 rows of sorted_alignments, with each
# organism name reduced to its first two whitespace-separated words.
sorted_alignments["organism"].iloc[:100].apply(
    lambda organism_name: " ".join(organism_name.split()[:2])
).value_counts()


Out[48]:
Escherichia coli            57
Enterobacteria phage         9
Cloning vector               7
null                         6
Mus musculus                 5
Shigella flexneri            5
synthetic construct          3
Arachis hypogaea             2
Bombyx mori                  1
Streptomyces vinaceus        1
Rhodococcus erythropolis     1
[Kitasatospora] papulosa     1
Enterobacterial phage        1
Malus domestica              1
dtype: int64

In [ ]: