In [1]:
%load_ext autoreload

In [2]:
import os
import numpy as np
from tqdm import tqdm
import pickle
import subprocess
import xml.etree.ElementTree as ET

In [3]:
%autoreload
from parsing_utilities import *

In [4]:
# Folder holding the source PDFs and folder holding the processed files.
# Both can be overridden through environment variables so the notebook is not
# tied to one machine; the defaults are the paths originally hard-coded here.
input_folder = os.environ.get(
    'MFGPC_INPUT_FOLDER',
    '/home/fmsnew/Documents/MFGPCpapers/papers_pdfs',
)
output_folder = os.environ.get(
    'MFGPC_OUTPUT_FOLDER',
    '/home/fmsnew/Documents/MFGPCpapers/papers_processed',
)

In [7]:
# Call parse_pdfs with the input and output folders.
# NOTE(review): parse_pdfs is not defined in this notebook; presumably it comes from
# the star import of parsing_utilities and produces the '<name>.tei.xml' and
# '<name>.references.tei.xml' files that a later cell reads from output_folder — confirm.
parse_pdfs(input_folder, output_folder)

In [8]:
# Collect metadata and references for every PDF found in input_folder.
#   papers_data: paper name (file name without '.pdf') -> {'metadata': ..., 'references': ...}
#   exceptions:  names of the papers whose files could not be processed
papers_data = {}
exceptions = []
# os.listdir is not recursive: only files directly inside input_folder are visited.
# sorted() makes the processing order, and thus the dict/list order, reproducible.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith('.pdf'):
        continue
    # Computed outside the try so the handler below can always rely on `name`.
    name = filename[:-len('.pdf')]
    header_path = os.path.join(output_folder, name + '.tei.xml')
    references_path = os.path.join(output_folder, name + '.references.tei.xml')
    try:
        # The entry is stored only if both helper calls succeed.
        papers_data[name] = {'metadata': get_paper_metadata(header_path),
                             'references': get_paper_references(references_path)}
    except Exception as e:
        # KeyboardInterrupt is not an Exception subclass, so Ctrl-C still propagates.
        # Author's observation: for some reason this fails for files whose names
        # contain commas ','.
        exceptions.append(name)
        # Include the exception type: str(e) alone is often uninformative.
        print(type(e).__name__, e, name)

In [9]:
# Persist the collected paper data. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises part-way through.
with open('../papers_data_mfgpc.pkl', 'wb') as pickle_file:
    pickle.dump(papers_data, pickle_file)

In [5]:
from joblib import Parallel, delayed

In [ ]:
# Scratch check of joblib: apply np.sqrt to 10000 random n-by-n matrices
# with n_jobs=8 and discard the results.
n = 10
sqrt_task = delayed(np.sqrt)
worker_pool = Parallel(n_jobs=8, verbose=2)
_ = worker_pool(
    sqrt_task(np.random.rand(n, n))
    for task_index in range(10000)
)


[Parallel(n_jobs=8)]: Done 106 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 1316 tasks      | elapsed:    3.5s
[Parallel(n_jobs=8)]: Done 3346 tasks      | elapsed:    7.9s
[Parallel(n_jobs=8)]: Done 6176 tasks      | elapsed:   14.3s

In [7]:
# Element-wise square root of a 5x5 matrix of uniform random samples.
np.sqrt(np.random.random_sample((5, 5)))


Out[7]:
array([[0.63277312, 0.10675245, 0.44715546, 0.92566271, 0.77037945],
       [0.38817485, 0.81593073, 0.64905691, 0.70311537, 0.1644966 ],
       [0.58596323, 0.8827905 , 0.99689745, 0.57015255, 0.6901666 ],
       [0.20012015, 0.6713173 , 0.98971789, 0.65918433, 0.41014242],
       [0.79788066, 0.96432143, 0.97179256, 0.45194911, 0.2986172 ]])

In [ ]:


In [ ]:


In [ ]: