"""" python_arXiv_paging_example.py

This script dumps metadata from arxiv API and stores it
in a mongodb database. There is a 3 seconds wait between api calls.

Please see the documentation at 
http://export.arxiv.org/api_help/docs/user-manual.html
for more information, or email the arXiv API
mailing list at arxiv-api@googlegroups.com.

""""


In [5]:
import urllib.request
import time
import feedparser

from pymongo import MongoClient

In [6]:
# Base API query URL
base_url = 'http://export.arxiv.org/api/query?'

# connect to the local MongoDB instance and select the database
client = MongoClient('localhost:27017')
db = client.arXivDB
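
In [ ]:
# Quick sanity check (an addition on my part, not part of the original run):
# confirm the MongoDB server is reachable and see how many entries are already
# stored, since the paging loop below resumes from db.arXivfeeds.count().
print(client.server_info()['version'])   # raises if the server is unreachable
print(db.arXivfeeds.count())             # number of entries dumped so far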

In [7]:
search_query = 'cat:quant-ph'
start = db.arXivfeeds.count()   # resume from the number of entries already stored
results_per_iteration = 2000
wait_time = 3                   # seconds to sleep between successful API calls
no_retries = 10

# fetching the total number of results for this query
query = 'search_query=%s&start=%i&max_results=%i' % (search_query, 0, 1)
response = urllib.request.urlopen(base_url + query).read()
feed = feedparser.parse(response)
total_results = int(feed.feed.opensearch_totalresults)
# total_results = 100

print('Searching arXiv for {}'.format(search_query))
print('Found a total of {} entries'.format(total_results))


Searching arXiv for cat:quant-ph
Found a total of 76175 entries
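
In [ ]:
# Side note (a sketch, not part of the original run): the same query string can
# be built with urllib.parse.urlencode, which escapes the parameter values for
# you; the API handles the percent-encoded form as well.
import urllib.parse

params = {'search_query': search_query, 'start': 0, 'max_results': 1}
print(base_url + urllib.parse.urlencode(params))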

In [10]:
response


Out[10]:
b'<?xml version="1.0" encoding="UTF-8"?>\n<feed xmlns="http://www.w3.org/2005/Atom">\n  <link href="http://arxiv.org/api/query?search_query%3Dcat%3Aquant-ph%26id_list%3D%26start%3D0%26max_results%3D1" rel="self" type="application/atom+xml"/>\n  <title type="html">ArXiv Query: search_query=cat:quant-ph&amp;id_list=&amp;start=0&amp;max_results=1</title>\n  <id>http://arxiv.org/api/mHFE3HizSfzBReI+cza2RZg8lQQ</id>\n  <updated>2016-10-27T00:00:00-04:00</updated>\n  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">76175</opensearch:totalResults>\n  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>\n  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:itemsPerPage>\n  <entry>\n    <id>http://arxiv.org/abs/quant-ph/9412001v1</id>\n    <updated>1994-12-21T09:22:43Z</updated>\n    <published>1994-12-21T09:22:43Z</published>\n    <title>Photon Statistics of a Two-Mode Squeezed Vacuum</title>\n    <summary>  We investigate the general case of the photon distribution of a two-mode\nsqueezed vacuum and show that the distribution of photons among the two modes\ndepends on four parameters: two squeezing parameters, the relative phase\nbetween the two oscillators and their spatial orientation. The distribution of\nthe total number of photons depends only on the two squeezing parameters. We\nderive analytical expressions and present pictures for both distributions.\n</summary>\n    <author>\n      <name>G. Schrade</name>\n    </author>\n    <author>\n      <name>V. M. Akulin</name>\n    </author>\n    <author>\n      <name>W. P. Schleich</name>\n    </author>\n    <author>\n      <name>V. I. Man\'ko</name>\n    </author>\n    <arxiv:doi xmlns:arxiv="http://arxiv.org/schemas/atom">10.1103/PhysRevA.48.2398</arxiv:doi>\n    <link title="doi" href="http://dx.doi.org/10.1103/PhysRevA.48.2398" rel="related"/>\n    <arxiv:comment xmlns:arxiv="http://arxiv.org/schemas/atom">LATEX, 6 pages, Contribution to the third International Workshop on\n  Squeezed States and Uncertainty Relations, Baltimore, August 1993</arxiv:comment>\n    <arxiv:journal_ref xmlns:arxiv="http://arxiv.org/schemas/atom">Phys.Rev.A48:2398-2406,1993</arxiv:journal_ref>\n    <link href="http://arxiv.org/abs/quant-ph/9412001v1" rel="alternate" type="text/html"/>\n    <link title="pdf" href="http://arxiv.org/pdf/quant-ph/9412001v1" rel="related" type="application/pdf"/>\n    <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="quant-ph" scheme="http://arxiv.org/schemas/atom"/>\n    <category term="quant-ph" scheme="http://arxiv.org/schemas/atom"/>\n  </entry>\n</feed>\n'
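
In [ ]:
# feedparser exposes the Atom feed above as a list of dict-like entries; these
# are the documents that get inserted into MongoDB below. A few of the fields,
# using the single entry fetched above:
entry = feed.entries[0]
print(entry.id)         # http://arxiv.org/abs/quant-ph/9412001v1
print(entry.title)      # Photon Statistics of a Two-Mode Squeezed Vacuum
print(entry.published)  # 1994-12-21T09:22:43Z
print([author.name for author in entry.authors])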

In [27]:
for i in range(start, total_results, results_per_iteration):

    print("Results {} to {}".format(i, i + results_per_iteration))

    query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                         i,
                                                         results_per_iteration)

    for j in range(no_retries):
        # perform a GET request using the base_url and query
        response = urllib.request.urlopen(base_url + query).read()

        # parse the response using feedparser
        feed = feedparser.parse(response)

        if feed.entries:
            dates = [entry.published for entry in feed.entries]
            print('entries from {} to {}'.format(dates[0], dates[-1]))

            # dump the entries into MongoDB
            db.arXivfeeds.insert_many(feed.entries)

            # play nice and sleep a bit before the next call to the API
            print('Read successful. Sleeping for %i seconds' % wait_time)
            time.sleep(wait_time)
            break
        else:
            # exponential backoff: wait 3, 9, 27, ... seconds between retries
            print('reading from the API failed, retrying in {} seconds'.format(wait_time**(j + 1)))
            time.sleep(wait_time**(j + 1))
    else:
        # all retries exhausted without a readable response; skip this page
        print('Skipping results {} to {} after {} failed attempts'.format(i, i + results_per_iteration, no_retries))

print('download and dump done!')


Searching arXiv for cat:quant-ph
Found a total of 76091 entries
Results 50000 to 52000
reading from the API failed, retrying in 3 seconds
reading from the API failed, retrying in 9 seconds
reading from the API failed, retrying in 27 seconds
reading from the API failed, retrying in 81 seconds
reading from the API failed, retrying in 243 seconds
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-27-c0c5ca39e1c0> in <module>()
     42         else:
     43             print('reading from the API failed, retrying in {} seconds'.format(wait_time**(j+1)))
---> 44             time.sleep(wait_time**(j+1))
     45 
     46     # playing nice and sleeping a bit before next call

KeyboardInterrupt: 

In [ ]:
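# A minimal sketch of reading the dump back out of MongoDB once the loop above
# has finished: count the stored entries and inspect a single document.
print(db.arXivfeeds.count())
print(db.arXivfeeds.find_one({}, {'title': 1, 'published': 1}))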