In [1]:
%load_ext autoreload
In [2]:
import os
import numpy as np
from tqdm import tqdm
import pickle
import subprocess
import xml.etree.ElementTree as ET
In [3]:
%autoreload
from parsing_utilities import *
In [4]:
# Source directory holding the PDFs, and the directory from which the
# corresponding *.tei.xml files are read back later in this notebook.
input_folder, output_folder = (
    '/home/fmsnew/Documents/MFGPCpapers/papers_pdfs',
    '/home/fmsnew/Documents/MFGPCpapers/papers_processed',
)
In [7]:
# Run the PDF parsing helper on every paper in input_folder.
# NOTE(review): presumably this writes the *.tei.xml / *.references.tei.xml
# files into output_folder that the next cell reads — confirm in parse_pdfs.
parse_pdfs(input_folder, output_folder)
In [8]:
# Build {paper name: {'metadata': ..., 'references': ...}} for every PDF found
# directly inside input_folder (os.listdir does not descend into subfolders).
papers_data = {}
exceptions = []  # base names of the papers that could not be loaded
for filename in os.listdir(input_folder):
    if not filename.endswith('.pdf'):
        continue

    # Strip the '.pdf' suffix to get the base name shared by the TEI outputs.
    name = filename[:-len('.pdf')]
    header_path = os.path.join(output_folder, name + '.tei.xml')
    references_path = os.path.join(output_folder, name + '.references.tei.xml')

    try:
        papers_data[name] = {
            'metadata': get_paper_metadata(header_path),
            'references': get_paper_references(references_path),
        }
    except Exception as e:
        # Best-effort: record the failure and keep going. KeyboardInterrupt is
        # not an Exception subclass, so interrupting the cell still works.
        # Known issue: for some reason files whose names contain commas ','
        # fail to process.
        exceptions.append(name)
        print(e, name)
In [9]:
# Persist the collected paper data. The context manager makes sure the file
# handle is flushed and closed even if pickling raises part-way through
# (the bare open(...) used before was never closed explicitly).
with open('../papers_data_mfgpc.pkl', 'wb') as pkl_file:
    pickle.dump(papers_data, pkl_file)
In [5]:
from joblib import Parallel, delayed
In [ ]:
# Dispatch np.sqrt over 10000 random n-by-n matrices across 8 joblib workers.
# The results are discarded; this looks like a scratch check that joblib
# parallelism works in this environment.
n = 10
sqrt_tasks = (
    delayed(np.sqrt)(np.random.rand(n, n))  # matrices are generated lazily
    for task_idx in range(10000)
)
_ = Parallel(n_jobs=8, verbose=2)(sqrt_tasks)
In [7]:
# Element-wise square root of a random 5x5 matrix, displayed as the cell
# output (serial counterpart of the joblib call in the previous cell).
np.sqrt(np.random.rand(5, 5))
Out[7]:
In [ ]:
In [ ]:
In [ ]: