"""python_arXiv_paging_example.py
This script dumps metadata from the arXiv API and stores it
in a MongoDB database. There is a 3-second wait between API calls.
Please see the documentation at
http://export.arxiv.org/api_help/docs/user-manual.html
for more information, or email the arXiv API
mailing list at arxiv-api@googlegroups.com.
"""
# In [5]:
import urllib.request
import time
import feedparser
from pymongo import MongoClient
# In [6]:
# Endpoint that every arXiv API query string gets appended to
base_url = 'http://export.arxiv.org/api/query?'
# Connect to the MongoDB server on localhost:27017 and select the
# arXivDB database
client = MongoClient('localhost:27017')
db = client['arXivDB']
# In [7]:
# Search parameters and paging configuration.
search_query = 'cat:quant-ph'
# Resume where a previous run stopped: the number of documents already
# stored is used as the offset of the first result to request.
# Collection.count() was removed in PyMongo 4; count_documents({}) is the
# supported way to count every document in the collection.
start = db.arXivfeeds.count_documents({})
results_per_iteration = 2000  # entries requested per API call
wait_time = 3  # seconds to sleep between API calls
no_retries = 10  # attempts per page before giving up
# fetching the total number of results: a one-entry request is enough,
# the count is read from the feed's opensearch_totalresults field
query = 'search_query=%s&start=%i&max_results=%i' % (search_query, 0, 1)
# the context manager closes the HTTP connection once the body is read
with urllib.request.urlopen(base_url+query) as reply:
    response = reply.read()
feed = feedparser.parse(response)
total_results = int(feed.feed.opensearch_totalresults)
# Debugging override left by the author (presumably to cap a trial run):
# total_results = 100
print('Searching arXiv for {}'.format(search_query))
print('Found a total of {} entries'.format(total_results))
# In [10]:
# Notebook leftover: a bare expression that the notebook cell used to
# display the raw API response bytes; as a script statement it only
# evaluates the name and has no other effect.
response
# Out[10]:
# In [27]:
# Page through the result set from offset `start`, one API call per page,
# and store every page of entries in the arXivfeeds collection.
for i in range(start, total_results, results_per_iteration):
    print("Results {} to {}".format(i, i+results_per_iteration))
    query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                         i,
                                                         results_per_iteration)
    for j in range(no_retries):
        entries = []
        try:
            # perform a GET request using the base_url and query; the
            # context manager closes the connection once the body is read
            with urllib.request.urlopen(base_url+query) as reply:
                response = reply.read()
        except OSError as err:
            # URLError, HTTPError and socket timeouts are all OSError
            # subclasses: treat them as a failed attempt and retry instead
            # of aborting the whole run
            print('request failed: {}'.format(err))
        else:
            # parse the response using feedparser
            feed = feedparser.parse(response)
            entries = feed.entries
        if entries:
            dates = [entry.published for entry in entries]
            print('entries from {} to {}'.format(dates[0], dates[-1]))
            # dumping the entries into mongo db
            db.arXivfeeds.insert_many(entries)
            print('Read successful. Sleeping for %i seconds' % wait_time)
            # playing nice and sleeping a bit before next call
            # to the api again!
            time.sleep(wait_time)
            break
        # Exponential backoff between attempts; there is nothing to wait
        # for after the final attempt, so skip that sleep.
        if j + 1 < no_retries:
            backoff = wait_time**(j+1)
            print('reading from the API failed, retrying in {} seconds'.format(backoff))
            time.sleep(backoff)
    else:
        # Every attempt failed. Skipping the page would leave a gap, and
        # the resume offset is derived from the number of stored documents,
        # so a gap would misalign every later run. Stop instead.
        raise RuntimeError(
            'giving up on results {} to {} after {} failed attempts'.format(
                i, i+results_per_iteration, no_retries))
print('download and dump done!')
# In [ ]: