In [4]:
import os
import numpy as np
from tqdm import tqdm
import subprocess
import pickle
import xml.etree.ElementTree as ET
In [27]:
# recursively walk the TEI header tree and collect the title, date and abstract fields
def extract_metadata(v, result, mode=None, print_tree=False, indent=0):
    # element tag without the TEI namespace prefix
    tag = v.tag.split('}')[1]
    if print_tree:
        print('-' * indent + ' ' + tag, end='')
    if tag == 'title' or tag == 'date' or mode == 'abstract':
        # remember which field we are filling; the abstract text sits in child <p> elements
        if mode is None:
            mode = tag
        if v.text is not None:
            result[mode] = v.text
            if print_tree:
                print(': ' + v.text)
    else:
        if print_tree:
            print('')
        # descend into children, switching to 'abstract' mode below an <abstract> element
        for c in v:
            m = tag if tag == 'abstract' else None
            extract_metadata(c, result, m, print_tree, indent + 1)
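A quick way to see what extract_metadata produces is to run it on a small hand-written TEI fragment; the snippet below is a made-up sample, not real GROBID output.
In [ ]:
sample = '''<TEI xmlns="http://www.tei-c.org/ns/1.0">
  <teiHeader>
    <title>An example paper</title>
    <date>2010-09-27</date>
    <abstract><p>A one-sentence abstract.</p></abstract>
  </teiHeader>
</TEI>'''
sample_result = {}
extract_metadata(ET.fromstring(sample), sample_result)
sample_result
# expected: {'title': 'An example paper', 'date': '2010-09-27', 'abstract': 'A one-sentence abstract.'}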
In [36]:
# recursively walk GROBID's reference list; collect one dict per <biblStruct> entry,
# with nested 'analytic' / 'monogr' dicts holding titles and dates
def extract_references(v, result, mode=None):
    tag = v.tag.split('}')[1]
    if tag == 'title':
        if v.text is not None:
            result[-1][mode][tag] = v.text
        else:
            if tag not in result[-1][mode]:
                result[-1][mode][tag] = '<None>'
    if tag == 'date':
        result[-1][mode][tag] = v.attrib['when']
    m = None
    if tag == 'biblStruct':
        # a new bibliographic entry starts here
        result.append({})
        m = tag
    if tag == 'analytic' or tag == 'monogr':
        result[-1][tag] = {}
        m = tag
    if m is None:
        m = mode
    for c in v:
        extract_references(c, result, m)
In [37]:
# flatten the raw reference dicts into simple {'title': ..., 'date': ...} records
def process_refs(refs):
    prefs = []
    for ref in refs:
        pref = {}
        if 'analytic' in ref and 'monogr' in ref:
            # article in a journal/proceedings: title from analytic, date from the monogr imprint
            pref['title'] = ref['analytic']['title']
            if 'date' in ref['monogr']:
                pref['date'] = ref['monogr']['date']
        elif 'monogr' in ref:
            pref = ref['monogr']
        elif 'analytic' in ref:
            pref = ref['analytic']
        prefs.append(pref)
    return prefs
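To see how the two helpers fit together, here is a made-up single-entry listBibl fragment (not real GROBID output): extract_references collects the nested fields and process_refs flattens them into one {'title', 'date'} record per reference.
In [ ]:
sample_refs_xml = '''<listBibl xmlns="http://www.tei-c.org/ns/1.0">
  <biblStruct>
    <analytic>
      <title>A cited article</title>
    </analytic>
    <monogr>
      <title>Some Journal</title>
      <imprint>
        <date when="2008"/>
      </imprint>
    </monogr>
  </biblStruct>
</listBibl>'''
sample_refs = []
extract_references(ET.fromstring(sample_refs_xml), sample_refs)
process_refs(sample_refs)
# expected: [{'title': 'A cited article', 'date': '2008'}]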
The following code requires GROBID to be launched in service mode at localhost:8080, see http://grobid.readthedocs.io/en/latest/Grobid-service/
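Before processing a whole folder it can help to check that the service is actually reachable. This is a minimal sketch using only the standard library; it simply inspects the HTTP status of the root page, since the exact health-check endpoint depends on the GROBID version.
In [ ]:
import urllib.request

try:
    # any successful answer from the root page means the GROBID service is up
    status = urllib.request.urlopen('http://localhost:8080/', timeout=5).getcode()
    print('GROBID service reachable, HTTP status:', status)
except Exception as e:
    print('GROBID service not reachable:', e)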
In [38]:
# run a shell command from Python;
# the result is a string with the parsed document header in XML (TEI) format
output = subprocess.check_output("curl -v --form input=@./papers/1009.5419v2.pdf localhost:8080/processHeaderDocument",
                                 shell=True)
# decode the string into an ElementTree representation of the XML
tree = ET.ElementTree(ET.fromstring(output))
root = tree.getroot()
# traverse the tree recursively and extract the title, date and abstract fields
metadata = {}
extract_metadata(root, metadata, print_tree=True)
In [39]:
metadata
Out[39]:
In [40]:
output = subprocess.check_output("curl -v --form input=@./papers/1009.5419v2.pdf localhost:8080/processReferences",
                                 shell=True)
tree = ET.ElementTree(ET.fromstring(output))
root = tree.getroot()
references = []
extract_references(root, references)
processed_references = process_refs(references)
In [41]:
processed_references
Out[41]:
In [42]:
def get_paper_metadata(path):
    # ask the local GROBID service for the header of the PDF at `path` and extract title/date/abstract
    output = subprocess.check_output("curl -v --form input=@" + path + " localhost:8080/processHeaderDocument",
                                     shell=True)
    tree = ET.ElementTree(ET.fromstring(output))
    root = tree.getroot()
    metadata = {}
    extract_metadata(root, metadata)
    return metadata
In [43]:
def get_paper_references(path):
    # ask the local GROBID service for the reference list of the PDF at `path`
    output = subprocess.check_output("curl -v --form input=@" + path + " localhost:8080/processReferences",
                                     shell=True)
    tree = ET.ElementTree(ET.fromstring(output))
    root = tree.getroot()
    references = []
    extract_references(root, references)
    return process_refs(references)
In [44]:
papers_data = {}
exceptions = []
for folder_path in ['papers']:
    for filename in tqdm(os.listdir(folder_path)):
        try:
            path = folder_path + '/' + filename
            papers_data[path] = {'metadata': get_paper_metadata('"./' + path + '"'),
                                 'references': get_paper_references('"./' + path + '"')}
        except KeyboardInterrupt:
            raise
        except Exception as e:  # for some reason this fails on files whose names contain commas ','
            exceptions.append(folder_path + '/' + filename)
            print(e, folder_path + '/' + filename)
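The failures on file names containing commas are most likely a quoting problem in the curl --form argument rather than in GROBID itself. A possible workaround, sketched below and not verified here, is to upload the file directly from Python with the third-party requests library instead of shelling out to curl (get_paper_metadata_requests is a hypothetical helper, not used above).
In [ ]:
import requests  # third-party: pip install requests

def get_paper_metadata_requests(path):
    # same endpoint as get_paper_metadata, but no shell or curl involved
    with open(path, 'rb') as f:
        resp = requests.post('http://localhost:8080/processHeaderDocument',
                             files={'input': f})
    root = ET.fromstring(resp.content)
    metadata = {}
    extract_metadata(root, metadata)
    return metadata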
In [47]:
# store the mined data
with open('papers_data.pkl', 'wb') as f:
    pickle.dump(papers_data, f)
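The stored pickle can be read back later in the same way, for example:
In [ ]:
with open('papers_data.pkl', 'rb') as f:
    papers_data = pickle.load(f)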