This is an example of how to query the arXiv API and parse the returned articles.
The original was taken from https://arxiv.org/help/api/examples/python_arXiv_parsing_example.txt
The API is documented here: http://arxiv.org/help/api/user-manual
In [ ]:
# Query parameters. The names below are looked up by name when the query
# string is assembled, so they must not be renamed.
# To get the latest publications:
#search_query = 'astro-ph'
# ... or search for a specific term:
search_query = 'all:docker'
sortBy = 'lastUpdatedDate'  # can be relevance, lastUpdatedDate, submittedDate
sortOrder = 'descending'  # can be either ascending or descending
# Retrieve at most `max_results` entries, beginning at offset `start`
# (here: the first 10 results).
start = 0
max_results = 10
In [ ]:
import urllib
from contextlib import closing
try:  # Python 3
    from urllib.parse import urlencode
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlencode, urlopen

import feedparser
import pandas as pd
In [ ]:
# Base API query URL. There is probably no need to change it.
base_url = 'http://export.arxiv.org/api/query?'
# (name, value) pairs sent to the API, in the order they appear in the URL.
params = [
    ('search_query', search_query),
    ('start', start),
    ('max_results', max_results),
    ('sortBy', sortBy),
    ('sortOrder', sortOrder),
]
# Kept for backward compatibility: the tuple of parameter names.
fields = tuple(name for name, _ in params)
# urlencode percent-escapes every value, so search terms containing spaces,
# '&', '#', ... cannot corrupt the query string.
query = urlencode(params)
In [ ]:
# Opensearch metadata such as totalResults, startIndex,
# and itemsPerPage live in the opensearch namespace.
# Some entry metadata lives in the arXiv namespace.
# feedparser v4.1 needs a hack to expose both of these namespaces. The hack
# touches a private class, so it is applied only when the installed feedparser
# still exposes that class at package level; otherwise it is skipped instead
# of failing with an AttributeError.
_mixin = getattr(feedparser, '_FeedParserMixin', None)
if _mixin is not None:
    _mixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
    _mixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'
# Perform a GET request using the base_url and query; `closing` guarantees
# that the connection is released even if reading fails.
with closing(urlopen(base_url + query)) as http_response:
    response = http_response.read()
# Parse the raw response body using feedparser.
feed = feedparser.parse(response)
In [ ]:
# Print out feed information.
# print is called with a single parenthesised argument so that the cell runs
# unchanged on both Python 2 and Python 3.
print('Feed title: %s' % feed.feed.title)
print('Feed last updated: %s' % feed.feed.updated)
In [ ]:
# Print the opensearch metadata of the feed.
# print is called with a single parenthesised argument so that the cell runs
# unchanged on both Python 2 and Python 3.
print('totalResults for this query: %s' % feed.feed.opensearch_totalresults)
print('itemsPerPage for this query: %s' % feed.feed.opensearch_itemsperpage)
print('startIndex for this query: %s' % feed.feed.opensearch_startindex)
In [ ]:
# Accumulator for the per-entry dicts: the loop over feed.entries appends one
# dict per entry, and the list is finally passed to pd.DataFrame.
# NOTE(review): this reset sits in its own cell, so re-running only the loop
# cell presumably appends duplicate rows -- re-run this cell first.
rows = []
In [ ]:
# Run through each entry and collect its metadata as one dict per entry.
for entry in feed.entries:
    row = {}
    # The arXiv identifier is whatever follows '/abs/' in the entry id.
    row['arxiv-id'] = entry.id.split('/abs/')[-1]
    row['Published'] = entry.published
    row['Title'] = entry.title
    row['Last Author'] = entry.author
    if 'arxiv_affiliation' in entry:
        row['Last Author'] += ' (%s)' % entry.arxiv_affiliation
    row['Authors'] = ', '.join(
        author.get('name', '?') for author in entry.get('authors', []))
    # Get the links to the abs page and pdf for this e-print.
    # .get() is used because a link is not guaranteed to carry a 'title'
    # (or 'rel') attribute; attribute access would raise AttributeError.
    for link in entry.get('links', []):
        if link.get('rel') == 'alternate':
            row['page'] = link.href
        elif link.get('title') == 'pdf':
            row['pdf'] = link.href
    # The journal reference, comments and primary_category sections live under
    # the arxiv namespace.
    row['journal_ref'] = entry.get('arxiv_journal_ref', '-')
    row['Comments'] = entry.get('arxiv_comment', '-')
    # Since the <arxiv:primary_category> element has no data, only
    # attributes, feedparser does not store anything inside
    # entry.arxiv_primary_category.
    # This is a dirty hack to get the primary_category: just take the
    # first element in entry.tags. An entry without tags gets '-', like the
    # other optional fields, instead of raising.
    tags = entry.get('tags', [])
    row['Primary Category'] = tags[0]['term'] if tags else '-'
    # Let's get all the categories.
    all_categories = [t['term'] for t in tags]
    row['All Categories'] = ', '.join(all_categories)
    # The abstract is in the <summary> element.
    row['Abstract'] = entry.summary
    rows.append(row)
In [ ]:
# Build a table from the collected entries: one row per dict in `rows`.
pd.DataFrame(rows)