In [1]:
import Bio
from Bio import Entrez
Entrez.email = "vm@gmail.com"
In [2]:
handle = Entrez.einfo()
record = Entrez.read(handle)
print(record)
{u'DbList': ['pubmed', 'protein', 'nuccore', 'nucleotide', 'nucgss', 'nucest', 'structure', 'genome', 'assembly', 'genomeprj', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'clone', 'gap', 'gapplus', 'grasp', 'dbvar', 'epigenomics', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'journals', 'mesh', 'ncbisearch', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'probe', 'proteinclusters', 'pcassay', 'biosystems', 'pccompound', 'pcsubstance', 'pubmedhealth', 'seqannot', 'snp', 'sra', 'taxonomy', 'toolkit', 'toolkitall', 'toolkitbook', 'unigene', 'gencoll', 'gtr']}
In [3]:
print(record.keys())
record['DbList']
[u'DbList']
Out[3]:
['pubmed', 'protein', 'nuccore', 'nucleotide', 'nucgss', 'nucest', 'structure', 'genome', 'assembly', 'genomeprj', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'clone', 'gap', 'gapplus', 'grasp', 'dbvar', 'epigenomics', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'journals', 'mesh', 'ncbisearch', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'probe', 'proteinclusters', 'pcassay', 'biosystems', 'pccompound', 'pcsubstance', 'pubmedhealth', 'seqannot', 'snp', 'sra', 'taxonomy', 'toolkit', 'toolkitall', 'toolkitbook', 'unigene', 'gencoll', 'gtr']
In [3]:
#import xml.etree.cElementTree as ET
import bs4
In [6]:
dbname = 'sra'
handle = Entrez.esearch(db=dbname, retmax=10000, term='16S sequence bacteria')
records = Entrez.read(handle)
#print(records)
#with open('')
for recid in records['IdList']:
h = Entrez.efetch(db=dbname, id=recid, rettype="gb", retmode="xml")
rec = h.read()
#xml.sax.parseString(rec, SeqHandler())
soup = bs4.BeautifulSoup(rec, 'xml')
#print(soup.find_all('GBSeq_sequence').contents)
print (soup.GBSeq_sequence.contents[0].upper())
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-6-19d4ae9bbd8e> in <module>()
13 soup = bs4.BeautifulSoup(rec, 'xml')
14 #print(soup.find_all('GBSeq_sequence').contents)
---> 15 print (soup.GBSeq_sequence.contents[0].upper())
AttributeError: 'NoneType' object has no attribute 'contents'
In [8]:
hsearch = Entrez.esearch(db='pubmed', term='Cypripedioideae')
record = Entrez.read(hsearch)
record
Out[8]:
{u'Count': '10', u'RetMax': '10', u'IdList': ['24001522', '22685605', '21718793', '21241312', '19168860', '21642160', '15120407', '21653371', '21680361', '21684950'], u'TranslationStack': [{u'Count': '10', u'Field': 'All Fields', u'Term': 'Cypripedioideae[All Fields]', u'Explode': 'N'}, 'GROUP'], u'TranslationSet': [], u'RetStart': '0', u'QueryTranslation': 'Cypripedioideae[All Fields]'}
In [11]:
for rec_id in record['IdList'][0:1]:
h = Entrez.efetch(db="nucleotide", id=rec_id, rettype="gb", retmode="xml")
print(h.read())
<?xml version="1.0"?>
<!DOCTYPE GBSet PUBLIC "-//NCBI//NCBI GBSeq/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_GBSeq.dtd">
<GBSet>
<GBSeq>
<GBSeq_locus>BZ277537</GBSeq_locus>
<GBSeq_length>732</GBSeq_length>
<GBSeq_strandedness>double</GBSeq_strandedness>
<GBSeq_moltype>DNA</GBSeq_moltype>
<GBSeq_topology>linear</GBSeq_topology>
<GBSeq_division>GSS</GBSeq_division>
<GBSeq_update-date>15-OCT-2002</GBSeq_update-date>
<GBSeq_create-date>15-OCT-2002</GBSeq_create-date>
<GBSeq_definition>CH230-392F5.TJ CHORI-230 Segment 2 Rattus norvegicus genomic clone CH230-392F5, genomic survey sequence</GBSeq_definition>
<GBSeq_primary-accession>BZ277537</GBSeq_primary-accession>
<GBSeq_accession-version>BZ277537.1</GBSeq_accession-version>
<GBSeq_other-seqids>
<GBSeqid>gnl|dbGSS|4084010</GBSeqid>
<GBSeqid>gb|BZ277537.1|</GBSeqid>
<GBSeqid>gi|24001522</GBSeqid>
</GBSeq_other-seqids>
<GBSeq_keywords>
<GBKeyword>GSS</GBKeyword>
</GBSeq_keywords>
<GBSeq_source>Rattus norvegicus (Norway rat)</GBSeq_source>
<GBSeq_organism>Rattus norvegicus</GBSeq_organism>
<GBSeq_taxonomy>Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Glires; Rodentia; Sciurognathi; Muroidea; Muridae; Murinae; Rattus</GBSeq_taxonomy>
<GBSeq_references>
<GBReference>
<GBReference_reference>1</GBReference_reference>
<GBReference_position>1..732</GBReference_position>
<GBReference_authors>
<GBAuthor>Zhao,S.</GBAuthor>
<GBAuthor>Shetty,J.</GBAuthor>
<GBAuthor>Shatsman,S.</GBAuthor>
<GBAuthor>Tsegaye,G.</GBAuthor>
<GBAuthor>Geer,K.</GBAuthor>
<GBAuthor>Shvartsbeyn,A.</GBAuthor>
<GBAuthor>Gebregeorgis,E.</GBAuthor>
<GBAuthor>Overton,L.</GBAuthor>
<GBAuthor>Russell,D.</GBAuthor>
<GBAuthor>Chen,D.</GBAuthor>
<GBAuthor>Riggs,F.</GBAuthor>
<GBAuthor>de Jong,P.</GBAuthor>
<GBAuthor>Fraser,C.M.</GBAuthor>
</GBReference_authors>
<GBReference_title>Rat BAC End Sequences from Library CHORI-230 MboI segment</GBReference_title>
<GBReference_journal>Unpublished</GBReference_journal>
</GBReference>
</GBSeq_references>
<GBSeq_comment>Other_GSSs: CH230-392F5.TV~Contact: Shaying Zhao~Department of Eukaryotic Genomics~The Institute for Genomic Research~9712 Medical Center Dr., Rockville, MD 20850, USA~Tel: 301 838 0200~Fax: 301 838 0208~Email: szhao@tigr.org~Clones are derived from the rat BAC library CHORI-230 (http://www.chori.org/bacpac/rat230.htm). For BAC library availability, please contact Pieter de Jong (pdejong@mail.cho.org). Clones may be purchased from BACPAC Resources (http://www.chori.org/bacpac/or ering_information.htm). BAC end page: http://www.tigr.org/tdb/bac_ends/rat/bac_end_intro.html~Plate: 392 row: F column: 5~Seq primer: SP6~Class: BAC ends</GBSeq_comment>
<GBSeq_feature-table>
<GBFeature>
<GBFeature_key>source</GBFeature_key>
<GBFeature_location>1..732</GBFeature_location>
<GBFeature_intervals>
<GBInterval>
<GBInterval_from>1</GBInterval_from>
<GBInterval_to>732</GBInterval_to>
<GBInterval_accession>BZ277537.1</GBInterval_accession>
</GBInterval>
</GBFeature_intervals>
<GBFeature_quals>
<GBQualifier>
<GBQualifier_name>organism</GBQualifier_name>
<GBQualifier_value>Rattus norvegicus</GBQualifier_value>
</GBQualifier>
<GBQualifier>
<GBQualifier_name>mol_type</GBQualifier_name>
<GBQualifier_value>genomic DNA</GBQualifier_value>
</GBQualifier>
<GBQualifier>
<GBQualifier_name>strain</GBQualifier_name>
<GBQualifier_value>BN/SsNHsd/MCW</GBQualifier_value>
</GBQualifier>
<GBQualifier>
<GBQualifier_name>db_xref</GBQualifier_name>
<GBQualifier_value>taxon:10116</GBQualifier_value>
</GBQualifier>
<GBQualifier>
<GBQualifier_name>clone</GBQualifier_name>
<GBQualifier_value>CH230-392F5</GBQualifier_value>
</GBQualifier>
<GBQualifier>
<GBQualifier_name>sex</GBQualifier_name>
<GBQualifier_value>Female</GBQualifier_value>
</GBQualifier>
<GBQualifier>
<GBQualifier_name>cell_type</GBQualifier_name>
<GBQualifier_value>Brain</GBQualifier_value>
</GBQualifier>
<GBQualifier>
<GBQualifier_name>clone_lib</GBQualifier_name>
<GBQualifier_value>LIBGSS_003368 CHORI-230 Segment 2</GBQualifier_value>
</GBQualifier>
<GBQualifier>
<GBQualifier_name>note</GBQualifier_name>
<GBQualifier_value>Vector: pTARBAC1.3; Site_1: MboI; Site_2: MboI; CHORI-230 Rat (BN/SsNHsd/MCW) BAC library produced by Pieter de Jong</GBQualifier_value>
</GBQualifier>
</GBFeature_quals>
</GBFeature>
</GBSeq_feature-table>
<GBSeq_sequence>tccaacttagaaaaaactcaaacatgcacagtacaacttaacaatgagtttgctttatgtcattttgatatattttgactttgtgggccaaatttggaaaagaaccaacaaacaaggcaattacctttccaaaaaacacctttaccactatgcagtcattcttttgagcctttctgacagctaagactataatttgggtgggtctgaaagggatgatagagatagaaagttggttttggaaaccagagaaccataccctaaaaatatgactggctagagaaatttgtctaaggccagtaagccgcttgattccacttgtacaatcaccccaatgaggcctatgtgaacggatgagtttgtcaggaggaagaacaccttggtttacaggagctctatagaacgtgatttctagaatcctttctaagagacagaaggtcatactggcgtcctcactgtacctttacttagctcatttgcatacagaaccatctggtacttcagttctattctgagaaatgaataccaaggtcttacatgtcacatgaatgtaggtatgaaagactcttctacaaagagatactgtgcactatacaactgtgcccaactgcccacgacatccactgatccatcagaatgggcatttcttcttgggttttcatgcaggccatctagtcaatgaacttaaccaccaccggcgcaaccgtgagtagactttttctgtcaccccaattt</GBSeq_sequence>
<GBSeq_xrefs>
<GBXref>
<GBXref_dbname>BioSample</GBXref_dbname>
<GBXref_id>LIBGSS_003368</GBXref_id>
</GBXref>
</GBSeq_xrefs>
</GBSeq>
</GBSet>
In [5]:
print(record['sra'])
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-5-d37043a56955> in <module>()
----> 1 print(record['sra'])
KeyError: 'sra'
In [ ]:
Content source: mirjalil/DataScience
Similar notebooks: