In [4]:
import os
import numpy as np
from tqdm import tqdm
import subprocess
import pickle
import xml.etree.ElementTree as ET

In [27]:
# recursively traverse the TEI tree and collect the title, date and abstract text
def extract_metadata(v, result, mode=None, print_tree=False, indent=0):
    # strip the XML namespace: '{http://www.tei-c.org/ns/1.0}title' -> 'title'
    tag = v.tag.split('}')[1]
    if print_tree:
        print('-'*indent + ' ' + tag, end='')
    if tag == 'title' or tag == 'date' or mode == 'abstract':
        if mode is None:
            mode = tag
        if v.text is not None:
            result[mode] = v.text
            if print_tree:
                print(': ' + v.text)
    else:
        if print_tree:
            print('')
    for c in v:  # Element.getchildren() was removed in Python 3.9; iterate directly
        # once inside <abstract>, keep collecting text from all of its descendants
        m = tag if tag == 'abstract' else None
        extract_metadata(c, result, m, print_tree, indent + 1)
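
A quick way to see what the traversal collects is to run it on a tiny hand-built TEI-like snippet instead of real GROBID output (the namespace is required because the function splits tags on '}'):

In [ ]:
demo = ET.fromstring('<TEI xmlns="http://www.tei-c.org/ns/1.0">'
                     '<title>Demo title</title><date>March 8, 2011</date></TEI>')
demo_result = {}
extract_metadata(demo, demo_result)
demo_result  # {'title': 'Demo title', 'date': 'March 8, 2011'}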

In [36]:
# recursively collect references: each <biblStruct> becomes one dict holding an
# 'analytic' part (the cited paper) and/or a 'monogr' part (the containing venue)
def extract_references(v, result, mode=None):
    tag = v.tag.split('}')[1]
    if tag == 'title':
        if v.text is not None:
            result[-1][mode][tag] = v.text
        else:
            if tag not in result[-1][mode]:
                result[-1][mode][tag] = '<None>'
    if tag == 'date' and 'when' in v.attrib:  # some <date> elements carry no @when
        result[-1][mode][tag] = v.attrib['when']

    m = None
    if tag == 'biblStruct':
        # each biblStruct is one bibliography entry
        result.append({})
        m = tag
    if tag == 'analytic' or tag == 'monogr':
        result[-1][tag] = {}
        m = tag

    if m is None:
        m = mode
    for c in v:  # getchildren() was removed in Python 3.9
        extract_references(c, result, m)
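
The same kind of sanity check works here, with a minimal hand-built bibliography of one entry (again, not real GROBID output):

In [ ]:
demo = ET.fromstring('<listBibl xmlns="http://www.tei-c.org/ns/1.0"><biblStruct>'
                     '<analytic><title>Paper A</title></analytic>'
                     '<monogr><imprint><date when="2010"/></imprint></monogr>'
                     '</biblStruct></listBibl>')
demo_refs = []
extract_references(demo, demo_refs)
demo_refs  # [{'analytic': {'title': 'Paper A'}, 'monogr': {'date': '2010'}}]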

In [37]:
# flatten the raw reference dicts into {'title': ..., 'date': ...} records;
# the title lives in 'analytic' when both parts exist, the date in 'monogr'
def process_refs(refs):
    prefs = []
    for ref in refs:
        pref = {}
        if 'analytic' in ref and 'monogr' in ref:
            pref['title'] = ref['analytic']['title']
            if 'date' in ref['monogr']:
                pref['date'] = ref['monogr']['date']
        elif 'monogr' in ref:
            pref = ref['monogr']
        elif 'analytic' in ref:
            pref = ref['analytic']
        prefs.append(pref)
    return prefs
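
Continuing the toy example, the flattening step then produces one record per entry:

In [ ]:
process_refs([{'analytic': {'title': 'Paper A'}, 'monogr': {'date': '2010'}}])
# -> [{'title': 'Paper A', 'date': '2010'}]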

Requirement

The following code requires GROBID to be launched in service mode at localhost:8080; see http://grobid.readthedocs.io/en/latest/Grobid-service/
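
Before running the calls below, it helps to verify that something is actually listening on that port. A minimal standard-library sketch (the host and port are just the ones assumed throughout this notebook):

In [ ]:
import socket

# fail fast if the GROBID service is not reachable
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.settimeout(2)
    if s.connect_ex(('localhost', 8080)) != 0:
        raise RuntimeError('GROBID service not reachable on localhost:8080')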

Example: extracting metadata from a PDF

The example uses the path './papers/1009.5419v2.pdf'


In [38]:
# run a shell command from python;
# the result is a byte string containing the parsed document header in XML (TEI) format
output = subprocess.check_output("curl -v --form input=@./papers/1009.5419v2.pdf localhost:8080/processHeaderDocument",
                                 shell=True)
# parse the output into an ElementTree representation of the XML
tree = ET.ElementTree(ET.fromstring(output))
root = tree.getroot()

# traverse the tree recursively and extract the title, date and abstract fields
metadata = {}
extract_metadata(root, metadata, print_tree=True)


 TEI
- teiHeader
-- encodingDesc
--- appInfo
---- application
----- ref
-- fileDesc
--- titleStmt
---- title: Portfolio Allocation for Bayesian Optimization
--- publicationStmt
---- publisher
---- availability
----- licence
---- date: March 8, 2011
--- sourceDesc
---- biblStruct
----- analytic
------ author
------- persName
-------- forename
-------- surname
------- affiliation
-------- orgName
-------- orgName
-------- address
--------- country
------ author
------- persName
-------- forename
-------- surname
------- affiliation
-------- orgName
-------- orgName
-------- address
--------- country
------ author
------- persName
-------- forename
-------- surname
------- affiliation
-------- orgName
-------- orgName
-------- address
--------- country
------ title: Portfolio Allocation for Bayesian Optimization
----- monogr
------ imprint
------- date: March 8, 2011
-- profileDesc
--- abstract
---- p: Bayesian optimization with Gaussian processes has become an increasingly popular tool in the machine learning community. It is efficient and can be used when very little is known about the objective function, making it popular in expensive black-box optimization scenarios. It uses Bayesian methods to sample the objective efficiently using an acquisition function which incorporates the model's estimate of the objective and the uncertainty at any given point. However, there are several different parameter-ized acquisition functions in the literature, and it is often unclear which one to use. Instead of using a single acquisition function, we adopt a portfolio of acquisition functions governed by an online multi-armed bandit strategy. We propose several portfolio strategies, the best of which we call GP-Hedge, and show that this method outperforms the best individual acquisition function. We also provide a theoretical bound on the algorithm's performance.
- text

In [39]:
metadata


Out[39]:
{'abstract': "Bayesian optimization with Gaussian processes has become an increasingly popular tool in the machine learning community. It is efficient and can be used when very little is known about the objective function, making it popular in expensive black-box optimization scenarios. It uses Bayesian methods to sample the objective efficiently using an acquisition function which incorporates the model's estimate of the objective and the uncertainty at any given point. However, there are several different parameter-ized acquisition functions in the literature, and it is often unclear which one to use. Instead of using a single acquisition function, we adopt a portfolio of acquisition functions governed by an online multi-armed bandit strategy. We propose several portfolio strategies, the best of which we call GP-Hedge, and show that this method outperforms the best individual acquisition function. We also provide a theoretical bound on the algorithm's performance.",
 'date': 'March 8, 2011',
 'title': 'Portfolio Allocation for Bayesian Optimization'}
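
Shelling out to curl works, but the same request can be made directly from Python. A sketch using the third-party requests package (not imported above, so this is an extra dependency), posting the PDF as the same multipart 'input' field:

In [ ]:
import requests

# equivalent of the curl call above, without going through the shell
with open('./papers/1009.5419v2.pdf', 'rb') as f:
    r = requests.post('http://localhost:8080/processHeaderDocument',
                      files={'input': f})
r.raise_for_status()
root = ET.fromstring(r.content)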

Example: extracting references from a PDF

The workflow is similar to the header extraction above


In [40]:
output = subprocess.check_output("curl -v --form input=@./papers/1009.5419v2.pdf localhost:8080/processReferences", 
                                 shell=True)
tree = ET.ElementTree(ET.fromstring(output))
root = tree.getroot()

references = []
extract_references(root, references)

processed_references = process_refs(references)

In [41]:
processed_references


Out[41]:
[{'date': '2010', 'title': 'Best arm identification in multi-armed bandits'},
 {'date': '1998',
  'title': 'Gambling in a rigged casino: the adversarial multi-armed bandit problem'},
 {'date': '2007',
  'title': 'Gaussian Processes for Regression and Optimisation'},
 {'date': '2010',
  'title': 'A Bayesian interactive optimization approach to procedural animation design'},
 {'date': '2010',
  'title': 'A tutorial on Bayesian optimization of expensive cost functions with application to active user modeling and hierarchical reinforcement learning'},
 {'date': '2007',
  'title': 'Active preference learning with discrete choice data'},
 {'date': '2009', 'title': 'Pure exploration in multi-armed bandits problems'},
 {'date': '2006', 'title': '<None>'},
 {'date': '2009', 'title': 'A parameter-free hedging algorithm'},
 {'date': '2005', 'title': 'Preference learning with Gaussian processes'},
 {'date': '1997',
  'title': 'SDO: A statistical method for global optimization'},
 {'date': '1998', 'title': 'Model-based geostatistics'},
 {'date': '2001', 'title': 'Modification of the DIRECT Algorithm'},
 {'date': '2010',
  'title': 'Regret bounds for Gaussian process bandit problems'},
 {'date': '2009',
  'title': 'New inference strategies for solving Markov decision processes using reversible jump MCMC'},
 {'date': '2006-03',
  'title': 'Global optimization of stochastic black-box systems via sequential Kriging meta-models'},
 {'date': '2009-08',
  'title': 'Automating the Configuration of Algorithms for Solving Hard Computational Problems'},
 {'date': '2001',
  'title': 'A taxonomy of global optimization methods based on response surfaces'},
 {'date': '1993',
  'title': 'Lipschitzian optimization without the Lipschitz constant'},
 {'date': '1998',
  'title': 'Efficient global optimization of expensive black-box functions'},
 {'date': '1964',
  'title': 'A new method of locating the maximum of an arbitrary multipeak curve in the presence of noise'},
 {'date': '2008', 'title': 'Practical Bayesian Optimization'},
 {'date': '2007',
  'title': 'Automatic gait optimization with Gaussian process regression'},
 {'date': '2009',
  'title': 'A Bayesian exploration-exploitation approach for optimal online sensing and planning with a visually guided mobile robot'},
 {'date': '2007',
  'title': 'Active policy learning for robot planning and exploration under uncertainty'},
 {'date': '1978',
  'title': 'Toward Global Optimization chapter The Application of Bayesian Methods for Seeking the Extremum'},
 {'date': '2010',
  'title': 'Bayesian Gaussian Processes for Sequential Prediction, Optimization and Quadrature'},
 {'date': '2003',
  'title': 'Gaussian processes to speed up hybrid Monte Carlo for expensive Bayesian integrals'},
 {'date': '2006', 'title': 'Gaussian Processes for Machine Learning'},
 {'date': '2009',
  'title': 'Approximate Bayesian inference for latent Gaussian models by using integrated nested Laplace approximations'},
 {'date': '2010',
  'title': 'Gaussian process optimization in the bandit setting: No regret and experimental design'}]

Mining all files in a folder

The following code processes the whole folder, mining headers and references from each PDF it contains


In [42]:
def get_paper_metadata(path):
    # path must already be shell-quoted by the caller
    output = subprocess.check_output("curl -v --form input=@" + path + " localhost:8080/processHeaderDocument",
                                     shell=True)
    tree = ET.ElementTree(ET.fromstring(output))
    root = tree.getroot()
    metadata = {}
    extract_metadata(root, metadata)
    return metadata

In [43]:
def get_paper_references(path):
    # path must already be shell-quoted by the caller
    output = subprocess.check_output("curl -v --form input=@" + path + " localhost:8080/processReferences",
                                     shell=True)
    tree = ET.ElementTree(ET.fromstring(output))
    root = tree.getroot()
    references = []
    extract_references(root, references)
    return process_refs(references)

In [44]:
papers_data = {}
exceptions = []
for folder_path in ['papers']:
    for filename in tqdm(os.listdir(folder_path)):
        try:
            path = folder_path + '/' + filename
            papers_data[path] = {'metadata':get_paper_metadata('"./' + path + '"'), 
                                'references':get_paper_references('"./' + path + '"')}
        except KeyboardInterrupt:
            raise
        except Exception as e: # for some reason failes to process files with names containing commas ','
            exceptions.append(folder_path + '/' + filename)
            print(e, folder_path + '/' + filename)


100%|██████████| 31/31 [00:26<00:00,  1.22it/s]
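
The comma failures appear to come from the shell/curl command line rather than from GROBID itself, so the skipped files can be recovered by posting them over HTTP directly, as in the requests sketch above (shown here for headers only; references work the same way):

In [ ]:
# retry the paths that the curl command choked on
for path in exceptions:
    with open('./' + path, 'rb') as f:
        r = requests.post('http://localhost:8080/processHeaderDocument',
                          files={'input': f})
    r.raise_for_status()
    metadata = {}
    extract_metadata(ET.fromstring(r.content), metadata)
    papers_data[path] = {'metadata': metadata}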

In [47]:
# store the mined data; the with-block ensures the file is flushed and closed
with open('papers_data.pkl', 'wb') as f:
    pickle.dump(papers_data, f)
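
The pickle can then be reloaded in a later session:

In [ ]:
with open('papers_data.pkl', 'rb') as f:
    papers_data = pickle.load(f)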