ArXiv parser

This is an example of how to query the arXiv API and parse the returned articles.

The original was taken from https://arxiv.org/help/api/examples/python_arXiv_parsing_example.txt

The API is documented at http://arxiv.org/help/api/user-manual

Search parameters


In [ ]:
# Query parameters for the arXiv API request. The request-building cell
# further down reads these variables by name, so do not rename them.

# to get the latest publications, use this query instead of the one below
#search_query = 'astro-ph'

# or search for a specific term
search_query = 'all:docker'

sortBy = 'lastUpdatedDate' # can be relevance, lastUpdatedDate, submittedDate
sortOrder = 'descending'    # can be either ascending or descending

# retrieve the first 10 results: begin at result 0 and return at most
# max_results entries
start = 0
max_results = 10

The code


In [ ]:
import urllib
import feedparser
import pandas as pd

In [ ]:
# Base URL of the API query endpoint. There is probably no need to change it.
base_url = 'http://export.arxiv.org/api/query?'

# Names of the notebook-level variables that become query parameters,
# in the order in which they appear in the query string.
fields = ('search_query', 'start', 'max_results', 'sortBy', 'sortOrder')

# Look each variable up by name and assemble "name=value" pairs joined by '&'.
# NOTE: values are inserted verbatim (no percent-encoding is applied here).
query = '&'.join(
    '='.join((name, str(globals()[name]))) for name in fields
)

In [ ]:
# OpenSearch metadata (totalResults, startIndex, itemsPerPage) lives in the
# opensearch namespace, and some entry metadata lives in the arXiv namespace.
# Hack to expose both namespaces in feedparser v4.1: register a prefix for
# each namespace URI on the parser mixin's namespace table.
feedparser._FeedParserMixin.namespaces.update({
    'http://a9.com/-/spec/opensearch/1.1/': 'opensearch',
    'http://arxiv.org/schemas/atom': 'arxiv',
})

# urlopen lives in urllib.request on Python 3 and directly in urllib on
# Python 2; import it from whichever location exists so this cell runs on both.
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen
from contextlib import closing

# perform a GET request using the base_url and query.
# closing() guarantees the connection is released even if read() raises
# (the Python 2 response object cannot be used directly in a with-statement).
with closing(urlopen(base_url + query)) as connection:
    response = connection.read()

# parse the raw response body using feedparser
feed = feedparser.parse(response)

In [ ]:
# print out feed information
# (the parenthesised single-argument form produces the same output on
# Python 2 and is also valid on Python 3, unlike the print statement)
print('Feed title: %s' % feed.feed.title)
print('Feed last updated: %s' % feed.feed.updated)

In [ ]:
# print opensearch metadata
# (the parenthesised single-argument form produces the same output on
# Python 2 and is also valid on Python 3, unlike the print statement)
print('totalResults for this query: %s' % feed.feed.opensearch_totalresults)
print('itemsPerPage for this query: %s' % feed.feed.opensearch_itemsperpage)
print('startIndex for this query: %s' % feed.feed.opensearch_startindex)

In [ ]:
# Accumulator for the parsed entries: the loop below appends one dict per
# feed entry. Re-run this cell before re-running the loop, otherwise the
# same entries are appended a second time.
rows = []

In [ ]:
# Run through each entry and collect its metadata as one dict appended to rows
for entry in feed.entries:
    row = {}
    # keep only the part of the entry id that follows '/abs/'
    row['arxiv-id'] = entry.id.split('/abs/')[-1]
    row['Published'] = entry.published
    row['Title'] = entry.title

    # entry.author is stored as the 'Last Author'; if the entry carries an
    # affiliation, it is appended in parentheses
    row['Last Author'] = entry.author
    if 'arxiv_affiliation' in entry:
        row['Last Author'] += ' (%s)' % entry.arxiv_affiliation

    # all author names, with '?' for any author record lacking a name
    row['Authors'] = ', '.join(author.get('name', '?') for author in entry.get('authors', []))

    # get the links to the abs page and pdf for this e-print.
    # .get() is used because a link is not guaranteed to have a 'title'
    # (or 'rel') attribute; plain attribute access on a missing key would
    # raise AttributeError and abort the whole loop.
    for link in entry.links:
        if link.get('rel') == 'alternate':
            row['page'] = link.href
        elif link.get('title') == 'pdf':
            row['pdf'] = link.href

    # The journal reference, comments and primary_category sections live under
    # the arxiv namespace; '-' marks a missing value
    row['journal_ref'] = entry.get('arxiv_journal_ref', '-')

    row['Comments'] = entry.get('arxiv_comment', '-')

    # Lets get all the categories (an entry without tags yields an empty list)
    all_categories = [t['term'] for t in entry.get('tags', [])]
    row['All Categories'] = ', '.join(all_categories)

    # Since the <arxiv:primary_category> element has no data, only
    # attributes, feedparser does not store anything inside
    # entry.arxiv_primary_category
    # This is a dirty hack to get the primary_category: just take the
    # first element of the entry's tags, or '-' when there are none.
    row['Primary Category'] = all_categories[0] if all_categories else '-'

    # The abstract is in the <summary> element
    row['Abstract'] = entry.summary
    rows.append(row)

In [ ]:
# Build a table from the collected rows (one row per entry, one column per
# dict key). As the last expression of the cell, it is presumably meant to be
# rendered as the cell's output when run in a notebook.
pd.DataFrame(rows)