In [1]:
import pandas as pd
from pandas import DataFrame, Series, Index
import numpy as np
import matplotlib.pyplot as plt

Hint from http://api.plos.org/search-examples/plos_search.py about how to get only articles in search:

query['fq'] = quote('doc_type:full AND !article_type_facet:"Issue Image"') #search only for articles

We need to be mindful of the API rate limit -- but PLoS has

http://api.plos.org/solr/faq/#solr_api_recommended_usage:

Please limit your API requests to 7200 requests a day, 300 per hour, 10 per minute and allow 5 seconds for your search to return results. If you exceed this threshold, we will lock out your IP address. If you’re a high-volume user of the PLOS Search API and need more API requests a day, please contact us at api@plos.org to discuss your options. We currently limit API users to no more than five concurrent connections from a single IP address.

[...]

PLOS Search API requests: Please do not send requests that return more than 100 rows. That’s a lot of data for our network to push all at once and it may take some time to return the result set. If you are getting back a result set that bigger than 100+ rows, then you likely need to change your query to return a smaller result set or set a limit on the records returned and page through the results.


In [5]:
import settings
import requests
import urllib

# http://api.plos.org/search-examples/plos_search.py
#  query['fq'] = quote('doc_type:full AND !article_type_facet:"Issue Image"') 
#search only for articles

def plos_search(q,start=0,rows=100,fl=None, extras=None):

    BASE_URL = 'http://api.plos.org/search'
    DEFAULT_FL = ('abstract','article_type','author_display',
                  'eissn','id','journal','publication_date',
                  'score','title_display')
    
    # fl indicates fields to return
    # http://wiki.apache.org/solr/CommonQueryParameters#fl
    
    if fl is None:
        fl_ = ",".join(DEFAULT_FL)
    else:
        fl_ = ",".join(fl)
        
    query = {'q':q,
             'start':start,
             'rows':rows,
             'api_key':settings.PLOS_KEY,
             'wt':'json',
             'fl':fl_,
             'fq': 'doc_type:full AND !article_type_facet:"Issue Image"'}
    
    if extras is not None:
        query.update(extras)
        
    query_url = BASE_URL + "?" +urllib.urlencode(query)
    
    r = requests.get(query_url)
    return r

In [6]:
r = plos_search(q='subject:"biotechnology"')

In [7]:
r.json()['response']['numFound']


Out[7]:
3577

In [8]:
docs = r.json()['response']['docs']

In [9]:
df = DataFrame(docs)
df.head()


Out[9]:
abstract article_type author_display eissn id journal publication_date score title_display
0 [\nThe objective of this paper is to assess th... Research Article [Latifah Amin, Md. Abul Kalam Azad, Mohd Hanaf... 1932-6203 10.1371/journal.pone.0086174 PLoS ONE 2014-01-29T00:00:00Z 1.212916 Determinants of Public Attitudes to Geneticall...
1 [\n Atrazine (ATZ) and S-metolachlor (S... Research Article [Cristina A. Viegas, Catarina Costa, Sandra An... 1932-6203 10.1371/journal.pone.0037140 PLoS ONE 2012-05-15T00:00:00Z 1.120443 Does <i>S</i>-Metolachlor Affect the Performan...
2 [\nDue to environmental persistence and biotox... Research Article [Yonggang Yang, Meiying Xu, Zhili He, Jun Guo,... 1932-6203 10.1371/journal.pone.0070686 PLoS ONE 2013-08-05T00:00:00Z 1.120443 Microbial Electricity Generation Enhances Deca...
3 [\n Intensive use of chlorpyrifos has r... Research Article [Shaohua Chen, Chenglan Liu, Chuyan Peng, Hong... 1932-6203 10.1371/journal.pone.0047205 NaN 2012-10-08T00:00:00Z 1.120443 Biodegradation of Chlorpyrifos and Its Hydroly...
4 [Background: The complex characteristics and u... Research Article [Zhongbo Zhou, Fangang Meng, So-Ryong Chae, Gu... 1932-6203 10.1371/journal.pone.0042270 NaN 2012-08-09T00:00:00Z 0.990341 Microbial Transformation of Biomacromolecules ...

5 rows × 9 columns


In [11]:
# get subjects of a given article
# http://api.plos.org/search?q=id:%2210.1371/journal.pbio.0050082%22&fl=id,subject,subject_level_1&api_key=[YOUR_API_KEY]

r = plos_search(q="10.1371/journal.pone.0039504",
            fl=('id','subject','subject_level'))

In [12]:
r.json()


Out[12]:
{u'response': {u'docs': [{u'id': u'10.1371/journal.pone.0039504',
    u'subject': [u'/Medicine and health sciences/Physiology/Physiological parameters/Body weight/Body mass index',
     u'/Biology and life sciences/Anatomy/Musculoskeletal system/Pelvis/Hip',
     u'/Medicine and health sciences/Physiology/Physiological parameters',
     u'/Biology and life sciences/Physiology/Physiological parameters/Body weight/Body mass index',
     u'/Biology and life sciences/Physiology/Physiological parameters/Body weight/Obesity',
     u'/Medicine and health sciences/Metabolic disorders/Diabetes mellitus',
     u'/Medicine and health sciences/Anatomy/Musculoskeletal system/Pelvis/Hip',
     u'/People and places/Demography/Death rates',
     u'/Biology and life sciences/Physiology/Physiological parameters',
     u'/Medicine and health sciences/Epidemiology/Ethnic epidemiology',
     u'/Medicine and health sciences/Physiology/Physiological parameters/Body weight/Obesity',
     u'/Biology and life sciences/Population biology/Population metrics/Death rates',
     u'/Physical sciences/Mathematics/Statistics (mathematics)/Confidence intervals']}],
  u'numFound': 1,
  u'start': 0}}

In [13]:
# There is also a list of all the top-level subject areas and their article counts: 
# http://api.plos.org/search/?q=*:*&fq=doc_type:full&rows=0&facet.field=subject_facet&facet.mincount=1

# this doesn't seem to work for me:
# https://groups.google.com/forum/#!searchin/plos-api-developers/subject/plos-api-developers/BqECTQkvRTI/r9v6oCAAOPoJ

r = plos_search(q="*.*",
                extras={'fq':'doc_type:full',
                    'facet.field':'subject_facet',
                    'facet.mincount':1
                    })
r


Out[13]:
<Response [200]>

In [14]:
r.json()


Out[14]:
{u'response': {u'docs': [], u'maxScore': 0.0, u'numFound': 0, u'start': 0}}

for simplicity, try to limit to PLoS One to start with


In [ ]:
# compute links back to journal, XML document

In [16]:
# get all articles

r = plos_search(q="*:*")
r


Out[16]:
<Response [200]>

In [17]:
r.json()['response']['numFound']


Out[17]:
118545

In [ ]: