In [ ]:

How to use the models we trained



In [1]:

    
# !pip install -U opencv-python tabula-py pdfquery



In [1]:

    
%load_ext autoreload
%autoreload 2

import os, numpy, glob, collections, random, \
    shutil, pandas, time, subprocess, itertools, \
    cv2, tempfile, scipy, pdfquery, lxml.etree, json, traceback
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]= "1"

from PIL import Image
from IPython.display import display, SVG
from joblib import Parallel, delayed

import matplotlib.pyplot as plt
%pylab inline

import model
from model import *
from prepare_images_utils import *
from train_test_augment import augment_image_deterministic, WINDOW_SIZE
from make_inout_pairs import get_stroke, convert_svg

import tabula

import warnings
warnings.filterwarnings('ignore')

pandas.set_option('max_columns', 100)









    



Populating the interactive namespace from numpy and matplotlib






    



/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['random']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"
Using TensorFlow backend.



In [2]:

    
# segmentation_model = load_model('./models/contours.h5', custom_objects=dict(dice_coef_loss=dice_coef_loss))
segmentation_model = load_model('./models/joint1.h5', custom_objects=model.__dict__)



In [11]:

    
# Network outputs:
# (Footer, Header, Body) - hbf
# (Text - Header+Footer, Body) - bt

DENSITY=100
PIXELS_TO_POINTS_FACTOR = 72.0 / DENSITY
POINTS_TO_PIXELS_FACTOR = DENSITY / 72.0

def pdf_to_pages(in_file, out_dir, pages=None):
    if pages is None:
        subprocess.check_call(['convert',
                               '-define', 'pdf:use-cropbox=true',
                               '-density', str(DENSITY),
                               in_file,
                               '-sharpen', '0x1.0',
#                                '-resample', '{0}x{0}'.format(DENSITY),
                               os.path.join(out_dir, '%04d.png')])
    else:
        for page in pages:
            subprocess.check_call(['convert',
                                   '-define', 'pdf:use-cropbox=true',
                                   '-density', str(DENSITY),
                                   '{}[{}]'.format(in_file, page),
                                   '-sharpen', '0x1.0',
#                                    '-resample', '{0}x{0}'.format(DENSITY),
                                   os.path.join(out_dir, '{:04d}.png'.format(page))])
    result = list(glob.glob(os.path.join(out_dir, '*.png')))
    result.sort()
    return result


def make_demo_mask(page_image, boxes):
    demo_mask = numpy.zeros((page_image.size[1], page_image.size[0], 3),
                            dtype='uint8')
    for channel, box in boxes:
        y1, x1, y2, x2 = box
        color = [0] * 3
        color[channel] = 255
        cv2.drawContours(demo_mask,
                         [numpy.array([(x1, y1),
                                       (x2, y1),
                                       (x2, y2),
                                       (x1, y2)])],
                         -1,
                         tuple(color),
                         cv2.FILLED)

    demo_mask = arr_to_img(demo_mask.astype('float32') / 255.0)
    demo_mask_blended = Image.blend(page_image.convert('RGB'), demo_mask, 0.5)
    return demo_mask_blended


def process_page(segm_model, full_image_fname, out_dir, mask_threshold=0.5, min_contour_area=500, read_mode='L', read_proc=identity, take_channels=2):
    transformations = list(augment_image_deterministic(full_image_fname, out_dir,
                                                       scales=[(1, 1)],
                                                       rotations=[0]))
    transformed_images = numpy.expand_dims(read_images_to_tensor([f for _, _, _, _, _, f in transformations],
                                                                 mode=read_mode,
                                                                 proc=read_proc),
                                           -1)

    partial_masks = segm_model.predict(transformed_images)
    if isinstance(partial_masks, list):
        partial_masks = partial_masks[0]

    page_image = load_image_opaque(full_image_fname, mode='L')
#     print('page size', page_image.size)

    aggregated_mask = numpy.zeros((page_image.size[1], page_image.size[0], partial_masks.shape[-1]),
                                  dtype='float32')
    norm = numpy.zeros_like(aggregated_mask)
    for (scale_x, scale_y, x_off, y_off, angle, _), mask_pixels in zip(transformations, partial_masks):
        mask_width = min(WINDOW_SIZE[0], aggregated_mask.shape[1] - x_off)
        mask_height = min(WINDOW_SIZE[1], aggregated_mask.shape[0] - y_off)

        mask = arr_to_img(mask_pixels)
        mask = mask.rotate(-angle).resize((mask_width, mask_height), Image.BILINEAR)
        mask_pixels = numpy.array(mask)
        aggregated_mask[y_off:y_off+mask_pixels.shape[1],
                        x_off:x_off+mask_pixels.shape[0]] += mask_pixels

        norm[y_off:y_off+mask_pixels.shape[1],
             x_off:x_off+mask_pixels.shape[0], :] += 1
    aggregated_mask = numpy.nan_to_num(aggregated_mask / norm) / 255
    nan_mask = numpy.where(aggregated_mask.sum(-1) < 1e-3)
    aggregated_mask[nan_mask[0], nan_mask[1], -1] = 1

#     display(arr_to_img(aggregated_mask))

    # renormalize with softmax
    aggregated_mask = numpy.exp(aggregated_mask)
    aggregated_mask /= numpy.expand_dims(aggregated_mask.sum(-1), -1)
    
#     display(arr_to_img(aggregated_mask))
    
    # binarize by argmax
    max_idx = aggregated_mask.argmax(axis=-1)
    aggregated_mask_bin = numpy.zeros_like(aggregated_mask)
    for channel in range(aggregated_mask.shape[-1]):
        mask = numpy.where(max_idx == channel)
        aggregated_mask_bin[mask[0], mask[1], channel] = 1
#     display(arr_to_img(aggregated_mask_bin))

#     aggregated_mask = binarize_tensor(aggregated_mask, mask_threshold * 255.0)

    contours = [(channel,
                 cv2.findContours((aggregated_mask_bin[:, :, channel]).astype('uint8'),
                                  cv2.RETR_LIST,
                                  cv2.CHAIN_APPROX_SIMPLE)[1])
                for channel in range(take_channels)]

    result = []
    boxes_for_demo_mask = []
    for channel, cur_contours in contours:
        for cnt in cur_contours:
            if cv2.contourArea(cnt) < min_contour_area:
                continue
            x, y, w, h = cv2.boundingRect(cnt)
            result.append((channel, numpy.array([y, x, y+h, x+w]) * PIXELS_TO_POINTS_FACTOR))
            boxes_for_demo_mask.append((channel, numpy.array([y, x, y+h, x+w])))

#     print(result)
    demo_mask_blended = make_demo_mask(page_image, boxes_for_demo_mask)
#     display(demo_mask_blended)

    return result, demo_mask_blended


def extract_table_with_tabula(pdf_file, page_no, box):
    return tabula.read_pdf(pdf_file, guess=False, pages=page_no, area=tuple(box))


# CORNDER_COORDS_IDX = [(0, 1), (0, 3), (2, 1), (2, 3)]
# def box_to_corner_coords(box):
#     return [(box[i], box[j]) for i, j in CORNDER_COORDS_IDX]


# def two_box_distance(a, b):
#     point_pairs = itertools.product(box_to_corner_coords(a),
#                                     box_to_corner_coords(b))
#     distances = map(scipy.spatial.distance.euclidean, point_pairs)
#     return min(distances)


def rect_distance(a, b):
    y1, x1, y1b, x1b = a
    y2, x2, y2b, x2b = b
    left = x2b < x1
    right = x1b < x2
    bottom = y2b < y1
    top = y1b < y2
    if top and left:
        return scipy.spatial.distance.euclidean((x1, y1b), (x2b, y2))
    elif left and bottom:
        return scipy.spatial.distance.euclidean((x1, y1), (x2b, y2b))
    elif bottom and right:
        return scipy.spatial.distance.euclidean((x1b, y1), (x2, y2b))
    elif right and top:
        return scipy.spatial.distance.euclidean((x1b, y1b), (x2, y2))
    elif left:
        return x1 - x2b
    elif right:
        return x2 - x1b
    elif bottom:
        return y1 - y2b
    elif top:
        return y2 - y1b
    else:             # rectangles intersect
        return 0.


def convert_coords_to_pq(box, cropbox):
    ul_y, ul_x, br_y, br_x = box
    x_off, _, _, page_height = cropbox
    return numpy.array([ul_x + x_off, page_height - br_y, br_x + x_off, page_height - ul_y])


def convert_coords_from_pq(box, cropbox):
    bl_x, bl_y, ur_x, ur_y = box
    x_off, _, _, page_height = cropbox
    return numpy.array([page_height - ur_y, bl_x - x_off, page_height - bl_y, ur_x - x_off])


TableInfo = collections.namedtuple('TableInfo',
                                   'page surrounding_text_boxes surrounding_texts body_box body success'.split(' '))


def process_pdf(segm_model, in_file, tmp_dir_prefix='/tmp', return_only_successful=True, pad=2, pages=None, max_text_distance=20, min_table_cols=2, min_table_rows=2):
    result = []
    parsed_pdf = pdfquery.PDFQuery(in_file, parse_tree_cacher=pdfquery.cache.FileCache("/tmp/"), laparams=None)
    parsed_pdf.load()
    pdf_basename = os.path.splitext(os.path.basename(in_file))[0]
    with tempfile.TemporaryDirectory(dir=tmp_dir_prefix) as wd:
        page_filenames = pdf_to_pages(in_file, wd, pages=pages)

        for page_fname in page_filenames:
            page_i = int(os.path.splitext(os.path.basename(page_fname))[0])
            page = parsed_pdf.get_page(page_i)
#             print('id', page_i, page.pageid)
#             page_crop = page.cropbox
#             offsets = numpy.array([-page_crop[1] - pad, -page_crop[0] - pad, -page_crop[1] + pad, -page_crop[0] + pad])
            offsets = numpy.array([-pad, -pad, pad, pad])

            boxes, mask_blended = process_page(segm_model, page_fname, wd, mask_threshold=0.5)

            mask_blended.save('/notebook/papers/nn/pages/{}_{:04d}.png'.format(pdf_basename, page_i))

            this_page_result = []

            bodies = [box + offsets for channel, box in boxes if channel == 1]
            for body_box in bodies:
                body = extract_table_with_tabula(in_file, page_i + 1, body_box)
                table_info = TableInfo(page_i,
                                       [],
                                       [],
                                       body_box,
                                       body,
                                       (not body is None) and body.shape[0] >= min_table_rows and body.shape[1] >= min_table_cols)
                if table_info.success or not return_only_successful:
                    this_page_result.append(table_info)

#                 if table_info.success:
#                     display(body)

            if len(this_page_result) > 0:
                surrounding_texts = [box + offsets for channel, box in boxes if channel == 0]
                for text_box in surrounding_texts:
#                     print('src box', text_box)
#                     print('src conv', convert_coords_to_pq(text_box, page.cropbox))
                    text_query = 'LTPage[pageid="{}"] :in_bbox("{},{},{},{}")'.format(page_i + 1,
                                                                                      *convert_coords_to_pq(text_box,
                                                                                                            page.cropbox))
#                     print(text_query)
                    query_res = parsed_pdf.pq(text_query)
                    text = query_res.text()
                    if not text:
                        continue
#                     print('found box', query_res[0].layout.bbox)
#                     print('conv back', convert_coords_from_pq(query_res[0].layout.bbox, page.cropbox))
#                     print('cropbox', page.cropbox)
#                     print(text)
                    distances_to_tables = [rect_distance(text_box, table.body_box)
                                           for table in this_page_result]
                    closest = numpy.argmin(distances_to_tables)
                    if distances_to_tables[closest] < max_text_distance:
                        table = this_page_result[closest]
                        table.surrounding_text_boxes.append(text_box)
                        table.surrounding_texts.append(text)

            result.extend(this_page_result)
    return result


def pdffigures_format_rect_from_dict(rect_info):
    return numpy.array([rect_info['y1'], rect_info['x1'], rect_info['y2'], rect_info['x2']])


def pf2_detect_tables(in_file, tmp_dir_prefix='/tmp'):
    with tempfile.TemporaryDirectory(dir=tmp_dir_prefix) as wd:
        subprocess.check_call(['sbt',
                               "run-main org.allenai.pdffigures2.FigureExtractorBatchCli {0} -e -m {1} -d {1}".format(in_file, wd)],
                              cwd='/notebook/pdffigures2')

        result_by_page = collections.defaultdict(list)
        for res_file in glob.glob(os.path.join(wd, '*.json')):
            with open(res_file, 'r') as f:
                parse_res = json.load(f)
            for table_info in parse_res:
                if table_info['figType'] != 'Table':
                    continue
                page_no = table_info['page']
                result_by_page[page_no].append((0, pdffigures_format_rect_from_dict(table_info['captionBoundary'])))
                result_by_page[page_no].append((1, pdffigures_format_rect_from_dict(table_info['regionBoundary'])))
        return result_by_page


def process_pdf_pf2(segm_model, in_file, tmp_dir_prefix='/tmp', return_only_successful=True, pad=2, pages=None, max_text_distance=20, min_table_cols=2, min_table_rows=2):
    result = []
    parsed_pdf = pdfquery.PDFQuery(in_file, parse_tree_cacher=pdfquery.cache.FileCache("/tmp/"), laparams=None)
    parsed_pdf.load()
    pdf_basename = os.path.splitext(os.path.basename(in_file))[0]
    with tempfile.TemporaryDirectory(dir=tmp_dir_prefix) as wd:
        page_filenames = pdf_to_pages(in_file, wd, pages=pages)
        boxes_by_pages = pf2_detect_tables(in_file, tmp_dir_prefix=wd)

        for page_fname in page_filenames:
            page_i = int(os.path.splitext(os.path.basename(page_fname))[0])
            page = parsed_pdf.get_page(page_i)
            offsets = numpy.array([-pad, -pad, pad, pad])

            boxes = boxes_by_pages.get(page_i, [])
            boxes_for_display = [(ch, box * POINTS_TO_PIXELS_FACTOR) for ch, box in boxes]
            page_image = load_image_opaque(page_fname, mode='L')
            mask_blended = make_demo_mask(page_image, boxes_for_display)
            mask_blended.save('/notebook/papers/pf/pages/{}_{:04d}.png'.format(pdf_basename, page_i))

            this_page_result = []

            bodies = [box + offsets for channel, box in boxes if channel == 1]
            for body_box in bodies:
                body = extract_table_with_tabula(in_file, page_i + 1, body_box)
                table_info = TableInfo(page_i,
                                       [],
                                       [],
                                       body_box,
                                       body,
                                       (not body is None) and body.shape[0] >= min_table_rows and body.shape[1] >= min_table_cols)
                if table_info.success or not return_only_successful:
                    this_page_result.append(table_info)

#                 if table_info.success:
#                     display(body)

            if len(this_page_result) > 0:
                surrounding_texts = [box + offsets for channel, box in boxes if channel == 0]
                for text_box in surrounding_texts:
#                     print('src box', text_box)
#                     print('src conv', convert_coords_to_pq(text_box, page.cropbox))
                    text_query = 'LTPage[pageid="{}"] :in_bbox("{},{},{},{}")'.format(page_i + 1,
                                                                                      *convert_coords_to_pq(text_box,
                                                                                                            page.cropbox))
#                     print(text_query)
                    query_res = parsed_pdf.pq(text_query)
                    text = query_res.text()
                    if not text:
                        continue
#                     print('found box', query_res[0].layout.bbox)
#                     print('conv back', convert_coords_from_pq(query_res[0].layout.bbox, page.cropbox))
#                     print('cropbox', page.cropbox)
#                     print(text)
                    distances_to_tables = [rect_distance(text_box, table.body_box)
                                           for table in this_page_result]
                    closest = numpy.argmin(distances_to_tables)
                    if distances_to_tables[closest] < max_text_distance:
                        table = this_page_result[closest]
                        table.surrounding_text_boxes.append(text_box)
                        table.surrounding_texts.append(text)

            result.extend(this_page_result)
    return result


def table_info_to_dict(tinfo):
    result = dict(page=tinfo.page,
                  surrounding_text_boxes=[b.tolist() for b in tinfo.surrounding_text_boxes] or [],
                  surrounding_texts=tinfo.surrounding_texts,
                  body_box=tinfo.body_box.tolist(),
                  success=bool(int(tinfo.success or False)))
    if tinfo.success:
        result['body'] = json.loads(tinfo.body.to_json(orient='split'), encoding='utf8')
    return result


def process_pdfs(model, files, *args, **kwargs):
    result = {}
    for file in files:
        result[file] = process_pdf(model, file, *args, **kwargs)
    return result



In [4]:

    
# with open('./mwh_docs/dump_2015_10_23_done_no_norm.json', 'r') as f:
#     dc_markup = json.load(f)

# dc_docs = set(os.path.basename(d['fields']['source_file']) for d in dc_markup if 'source_file' in d['fields'] and d['fields']['subject'] == 1)

# for fname in dc_docs:
#     try:
#         shutil.copy2(os.path.join('./mwh_docs/src/', fname), './mwh_docs/dc/')
#     except:
#         print(fname)

Vis



In [5]:

    
# %%prun -s cumulative
# process_pdfs(segmentation_model, glob.glob('/notebook/data/tmp/11955277.pdf'))

Vis2



In [5]:

    
# Image.fromarray(numpy.squeeze(visualize_activation(segmentation_model, 31)))



In [6]:

    
# for i, layer in enumerate(segmentation_model.layers):
#     print(i, layer)

Apply to PDFs from ./papers

Our model as detector



In [7]:

    
for fname in glob.glob('/notebook/papers/*.pdf'):
    try:
        print('Processing {}'.format(fname))
        doc_res = process_pdf(segmentation_model, fname, return_only_successful=True)
        serial_res = [table_info_to_dict(tinfo) for tinfo in doc_res]

        doc_basename = os.path.splitext(os.path.basename(fname))[0]
        with open('/notebook/papers/nn/tables/{}.json'.format(doc_basename), 'w') as f:
            json.dump(serial_res, f, indent=True)

        print('Successfully processed {}'.format(fname))
    except KeyboardInterrupt:
        break
    except:
        print('Could not process {}'.format(fname))
        print(traceback.format_exc())
        print()









    



Processing /notebook/papers/2007-8.pdf
Successfully processed /notebook/papers/2007-8.pdf
Processing /notebook/papers/2003-13.pdf
Successfully processed /notebook/papers/2003-13.pdf
Processing /notebook/papers/2007-3.pdf
Successfully processed /notebook/papers/2007-3.pdf
Processing /notebook/papers/2009-2.pdf
Successfully processed /notebook/papers/2009-2.pdf
Processing /notebook/papers/2012-4.pdf
Successfully processed /notebook/papers/2012-4.pdf
Processing /notebook/papers/2003-1.pdf
Successfully processed /notebook/papers/2003-1.pdf
Processing /notebook/papers/2003-3.pdf
Successfully processed /notebook/papers/2003-3.pdf
Processing /notebook/papers/2003-18.pdf
Successfully processed /notebook/papers/2003-18.pdf
Processing /notebook/papers/2003-12.pdf
Could not process /notebook/papers/2003-12.pdf
Traceback (most recent call last):
  File "<ipython-input-7-036fc9801456>", line 4, in <module>
    doc_res = process_pdf(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-6-e3106961fe37>", line 191, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str


Processing /notebook/papers/2014-1.pdf
Could not process /notebook/papers/2014-1.pdf
Traceback (most recent call last):
  File "<ipython-input-7-036fc9801456>", line 4, in <module>
    doc_res = process_pdf(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-6-e3106961fe37>", line 191, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str


Processing /notebook/papers/2001-10-dendritic-cell-based-xenoantigen-vaccination-for-prostate-cancer-immunotherapy.pdf
Successfully processed /notebook/papers/2001-10-dendritic-cell-based-xenoantigen-vaccination-for-prostate-cancer-immunotherapy.pdf
Processing /notebook/papers/2001-15-in-vivo-transfection-and-or-cross-priming-of-dendritic-cells-following-dna-and-adenoviral-immunizations-for-immunotherapy-of-cancer.pdf
Successfully processed /notebook/papers/2001-15-in-vivo-transfection-and-or-cross-priming-of-dendritic-cells-following-dna-and-adenoviral-immunizations-for-immunotherapy-of-cancer.pdf
Processing /notebook/papers/2012-3.pdf
Successfully processed /notebook/papers/2012-3.pdf
Processing /notebook/papers/2002-8-treatment-of-non-resectable-hepatocellular-carcinoma-with-autologous-tumor-pulsed-dendritic-cells.pdf
Successfully processed /notebook/papers/2002-8-treatment-of-non-resectable-hepatocellular-carcinoma-with-autologous-tumor-pulsed-dendritic-cells.pdf
Processing /notebook/papers/2007-11.pdf
Successfully processed /notebook/papers/2007-11.pdf
Processing /notebook/papers/2011-11.pdf
Successfully processed /notebook/papers/2011-11.pdf
Processing /notebook/papers/2008-6.pdf
Successfully processed /notebook/papers/2008-6.pdf
Processing /notebook/papers/2014-3.pdf
Successfully processed /notebook/papers/2014-3.pdf
Processing /notebook/papers/2000-18-treatment-of-solid-tumours-in-children-with-tumour-lysate-pulsed-dendritic-cells.pdf
Successfully processed /notebook/papers/2000-18-treatment-of-solid-tumours-in-children-with-tumour-lysate-pulsed-dendritic-cells.pdf
Processing /notebook/papers/2003-11.pdf
Successfully processed /notebook/papers/2003-11.pdf
Processing /notebook/papers/2009-4.pdf
Successfully processed /notebook/papers/2009-4.pdf
Processing /notebook/papers/2001-12-immune-and-clinical-responses-in-patients-with-metastatic-melanoma-to-cd34-progenitor-derived-dendritic-cell-vaccine.pdf
Successfully processed /notebook/papers/2001-12-immune-and-clinical-responses-in-patients-with-metastatic-melanoma-to-cd34-progenitor-derived-dendritic-cell-vaccine.pdf
Processing /notebook/papers/2011-3.pdf
Successfully processed /notebook/papers/2011-3.pdf
Processing /notebook/papers/2001-16-regression-of-lymph-node-metastases-by-immunotherapy-using-autologous-breast-tumor-lysate-pulsed-dendritic-cells-report-of-a-case.pdf
Successfully processed /notebook/papers/2001-16-regression-of-lymph-node-metastases-by-immunotherapy-using-autologous-breast-tumor-lysate-pulsed-dendritic-cells-report-of-a-case.pdf
Processing /notebook/papers/2004-1.pdf
Successfully processed /notebook/papers/2004-1.pdf
Processing /notebook/papers/2010-1.pdf
Successfully processed /notebook/papers/2010-1.pdf
Processing /notebook/papers/2007-2.pdf
Successfully processed /notebook/papers/2007-2.pdf
Processing /notebook/papers/2000-11-immunotherapy-of-hormone-refractory-prostate-cancer-with-antigen-loaded-dendritic-cells.pdf
Successfully processed /notebook/papers/2000-11-immunotherapy-of-hormone-refractory-prostate-cancer-with-antigen-loaded-dendritic-cells.pdf
Processing /notebook/papers/2003-16.pdf
Successfully processed /notebook/papers/2003-16.pdf
Processing /notebook/papers/2011-4.pdf
Successfully processed /notebook/papers/2011-4.pdf
Processing /notebook/papers/2002-1-a-phase-i-trial-of-tumor-lysate-pulsed-dendritic-cells-in-the-treatment-of-advanced-cancer.pdf
Successfully processed /notebook/papers/2002-1-a-phase-i-trial-of-tumor-lysate-pulsed-dendritic-cells-in-the-treatment-of-advanced-cancer.pdf
Processing /notebook/papers/2003-5.pdf
Successfully processed /notebook/papers/2003-5.pdf
Processing /notebook/papers/2010-2.pdf
Successfully processed /notebook/papers/2010-2.pdf
Processing /notebook/papers/2003-8.pdf
Successfully processed /notebook/papers/2003-8.pdf
Processing /notebook/papers/2012-2.pdf
Successfully processed /notebook/papers/2012-2.pdf
Processing /notebook/papers/2009-6.pdf
Successfully processed /notebook/papers/2009-6.pdf
Processing /notebook/papers/2003-17.pdf
Successfully processed /notebook/papers/2003-17.pdf
Processing /notebook/papers/2001-14-immunotherapy-of-bladder-cancer-using-autologous-dendritic-cells-pulsed-with-human-lymphocyte-antigen-a24-specific-mage-3-peptide.pdf
Successfully processed /notebook/papers/2001-14-immunotherapy-of-bladder-cancer-using-autologous-dendritic-cells-pulsed-with-human-lymphocyte-antigen-a24-specific-mage-3-peptide.pdf
Processing /notebook/papers/2004-3.pdf
Successfully processed /notebook/papers/2004-3.pdf
Processing /notebook/papers/2001-13-immunotherapy-for-medullary-thyroid-carcinoma-by-dendritic-cell-vaccination.pdf
Successfully processed /notebook/papers/2001-13-immunotherapy-for-medullary-thyroid-carcinoma-by-dendritic-cell-vaccination.pdf
Processing /notebook/papers/2011-5.pdf
Successfully processed /notebook/papers/2011-5.pdf
Processing /notebook/papers/2011-8.pdf
Successfully processed /notebook/papers/2011-8.pdf
Processing /notebook/papers/2003-15.pdf
Successfully processed /notebook/papers/2003-15.pdf
Processing /notebook/papers/2007-12.pdf
Successfully processed /notebook/papers/2007-12.pdf
Processing /notebook/papers/2011-10.pdf
Successfully processed /notebook/papers/2011-10.pdf
Processing /notebook/papers/2009-7.pdf
Successfully processed /notebook/papers/2009-7.pdf
Processing /notebook/papers/2002-2-dendritic-cell-immunotherapy-for-patients-with-metastatic-renal-cell-carcinoma-university-of-tokyo-experience.pdf
Successfully processed /notebook/papers/2002-2-dendritic-cell-immunotherapy-for-patients-with-metastatic-renal-cell-carcinoma-university-of-tokyo-experience.pdf
Processing /notebook/papers/2002-4-immunotherapy-of-solid-cancer-using-dendritic-cells-pulsed-with-the-hla-a24-restricted-peptide-of-carcinoembryonic-antigen.pdf
Successfully processed /notebook/papers/2002-4-immunotherapy-of-solid-cancer-using-dendritic-cells-pulsed-with-the-hla-a24-restricted-peptide-of-carcinoembryonic-antigen.pdf
Processing /notebook/papers/2007-5.pdf
Successfully processed /notebook/papers/2007-5.pdf
Processing /notebook/papers/2011-2.pdf
Successfully processed /notebook/papers/2011-2.pdf
Processing /notebook/papers/2003-7.pdf
Successfully processed /notebook/papers/2003-7.pdf
Processing /notebook/papers/2007-4.pdf
Successfully processed /notebook/papers/2007-4.pdf
Processing /notebook/papers/2002-7-the-feasibility-and-safety-of-immunotherapy-with-dendritic-cells-loaded-with-cea-mrna-following-neoadjuvant-chemoradiotherapy-and-resection-of-pancreatic-cancer.pdf
Successfully processed /notebook/papers/2002-7-the-feasibility-and-safety-of-immunotherapy-with-dendritic-cells-loaded-with-cea-mrna-following-neoadjuvant-chemoradiotherapy-and-resection-of-pancreatic-cancer.pdf
Processing /notebook/papers/2008-5.pdf
Could not process /notebook/papers/2008-5.pdf
Traceback (most recent call last):
  File "<ipython-input-7-036fc9801456>", line 4, in <module>
    doc_res = process_pdf(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-6-e3106961fe37>", line 194, in process_pdf
    page_filenames = pdf_to_pages(in_file, wd, pages=pages)
  File "<ipython-input-6-e3106961fe37>", line 17, in pdf_to_pages
    os.path.join(out_dir, '%04d.png')])
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/subprocess.py", line 291, in check_call
    raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['convert', '-define', 'pdf:use-cropbox=true', '-density', '100', '/notebook/papers/2008-5.pdf', '-sharpen', '0x1.0', '/tmp/tmpz6qyvqe6/%04d.png']' returned non-zero exit status 1.


Processing /notebook/papers/2011-12.pdf
Could not process /notebook/papers/2011-12.pdf
Traceback (most recent call last):
  File "<ipython-input-7-036fc9801456>", line 4, in <module>
    doc_res = process_pdf(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-6-e3106961fe37>", line 212, in process_pdf
    body = extract_table_with_tabula(in_file, page_i + 1, body_box)
  File "<ipython-input-6-e3106961fe37>", line 130, in extract_table_with_tabula
    return tabula.read_pdf(pdf_file, guess=False, pages=page_no, area=tuple(box))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/tabula/wrapper.py", line 97, in read_pdf
    return pd.read_csv(io.BytesIO(output), **pandas_options)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/io/parsers.py", line 655, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/io/parsers.py", line 411, in _read
    data = parser.read(nrows)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/io/parsers.py", line 1005, in read
    ret = self._engine.read(nrows)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/io/parsers.py", line 1748, in read
    data = self._reader.read(nrows)
  File "pandas/_libs/parsers.pyx", line 890, in pandas._libs.parsers.TextReader.read (pandas/_libs/parsers.c:10862)
  File "pandas/_libs/parsers.pyx", line 912, in pandas._libs.parsers.TextReader._read_low_memory (pandas/_libs/parsers.c:11138)
  File "pandas/_libs/parsers.pyx", line 966, in pandas._libs.parsers.TextReader._read_rows (pandas/_libs/parsers.c:11884)
  File "pandas/_libs/parsers.pyx", line 953, in pandas._libs.parsers.TextReader._tokenize_rows (pandas/_libs/parsers.c:11755)
  File "pandas/_libs/parsers.pyx", line 2184, in pandas._libs.parsers.raise_parser_error (pandas/_libs/parsers.c:28765)
pandas.errors.ParserError: Error tokenizing data. C error: Expected 2 fields in line 3, saw 3



Processing /notebook/papers/2003-9.pdf
Successfully processed /notebook/papers/2003-9.pdf
Processing /notebook/papers/2008-4.pdf
Successfully processed /notebook/papers/2008-4.pdf
Processing /notebook/papers/2004-2.pdf
Successfully processed /notebook/papers/2004-2.pdf
Processing /notebook/papers/2014-2.pdf
Successfully processed /notebook/papers/2014-2.pdf
Processing /notebook/papers/2009-5.pdf
Successfully processed /notebook/papers/2009-5.pdf
Processing /notebook/papers/2007-7.pdf
Could not process /notebook/papers/2007-7.pdf
Traceback (most recent call last):
  File "<ipython-input-7-036fc9801456>", line 4, in <module>
    doc_res = process_pdf(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-6-e3106961fe37>", line 191, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str


Processing /notebook/papers/2011-6.pdf
Successfully processed /notebook/papers/2011-6.pdf
Processing /notebook/papers/2002-5-mucin-gene-muc1-transfected-dendritic-cells-as-vaccine-results-of-a-phase-i-ii-clinical-trial.pdf
Successfully processed /notebook/papers/2002-5-mucin-gene-muc1-transfected-dendritic-cells-as-vaccine-results-of-a-phase-i-ii-clinical-trial.pdf
Processing /notebook/papers/2002-3-immunotherapy-of-metastatic-renal-cell-carcinoma-with-tumor-lysate-pulsed-autologous-dendritic-cells.pdf
Successfully processed /notebook/papers/2002-3-immunotherapy-of-metastatic-renal-cell-carcinoma-with-tumor-lysate-pulsed-autologous-dendritic-cells.pdf
Processing /notebook/papers/2013-1.pdf
Successfully processed /notebook/papers/2013-1.pdf
Processing /notebook/papers/2007-9.pdf
Successfully processed /notebook/papers/2007-9.pdf
Processing /notebook/papers/2011-9.pdf
Successfully processed /notebook/papers/2011-9.pdf
Processing /notebook/papers/2011-1.pdf
Successfully processed /notebook/papers/2011-1.pdf
Processing /notebook/papers/2002-6-optimizing-dendritic-cell-based-immunotherapy-in-multiple-myeloma.pdf
Successfully processed /notebook/papers/2002-6-optimizing-dendritic-cell-based-immunotherapy-in-multiple-myeloma.pdf
Processing /notebook/papers/2001-9-altered-peptide-ligand-vaccination-with-flt3-ligand-expanded-dendritic-cells-for-tumor-immunotherapy.pdf
Successfully processed /notebook/papers/2001-9-altered-peptide-ligand-vaccination-with-flt3-ligand-expanded-dendritic-cells-for-tumor-immunotherapy.pdf
Processing /notebook/papers/2007-6.pdf
Successfully processed /notebook/papers/2007-6.pdf
Processing /notebook/papers/2000-17-phase-i-study-in-melanoma-patients-of-a-vaccine-with-peptide-pulsed-dendritic-cells-generated-in-vitro-from-cd34-hematopoietic-progenitor-cells.pdf
Successfully processed /notebook/papers/2000-17-phase-i-study-in-melanoma-patients-of-a-vaccine-with-peptide-pulsed-dendritic-cells-generated-in-vitro-from-cd34-hematopoietic-progenitor-cells.pdf
Processing /notebook/papers/2012-1.pdf
Could not process /notebook/papers/2012-1.pdf
Traceback (most recent call last):
  File "<ipython-input-7-036fc9801456>", line 4, in <module>
    doc_res = process_pdf(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-6-e3106961fe37>", line 191, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str


Processing /notebook/papers/2008-1.pdf
Successfully processed /notebook/papers/2008-1.pdf
Processing /notebook/papers/2003-6.pdf
Successfully processed /notebook/papers/2003-6.pdf
Processing /notebook/papers/2008-3.pdf
Successfully processed /notebook/papers/2008-3.pdf
Processing /notebook/papers/2007-1-unlocked-by-wwwfreemypdfcom.pdf
Successfully processed /notebook/papers/2007-1-unlocked-by-wwwfreemypdfcom.pdf
Processing /notebook/papers/2003-2.pdf
Successfully processed /notebook/papers/2003-2.pdf

Pdffigures2 as detector



In [12]:

    
for fname in glob.glob('/notebook/papers/*.pdf'):
    try:
        print('Processing {}'.format(fname))
        doc_res = process_pdf_pf2(segmentation_model, fname, return_only_successful=True)
        serial_res = [table_info_to_dict(tinfo) for tinfo in doc_res]

        doc_basename = os.path.splitext(os.path.basename(fname))[0]
        with open('/notebook/papers/pf/tables/{}.json'.format(doc_basename), 'w') as f:
            json.dump(serial_res, f, indent=True)

        print('Successfully processed {}'.format(fname))
    except KeyboardInterrupt:
        break
    except:
        print('Could not process {}'.format(fname))
        print(traceback.format_exc())
        print()









    



Processing /notebook/papers/2007-8.pdf
Successfully processed /notebook/papers/2007-8.pdf
Processing /notebook/papers/2003-13.pdf
Successfully processed /notebook/papers/2003-13.pdf
Processing /notebook/papers/2007-3.pdf
Successfully processed /notebook/papers/2007-3.pdf
Processing /notebook/papers/2009-2.pdf
Successfully processed /notebook/papers/2009-2.pdf
Processing /notebook/papers/2012-4.pdf
Successfully processed /notebook/papers/2012-4.pdf
Processing /notebook/papers/2003-1.pdf
Successfully processed /notebook/papers/2003-1.pdf
Processing /notebook/papers/2003-3.pdf
Successfully processed /notebook/papers/2003-3.pdf
Processing /notebook/papers/2003-18.pdf
Successfully processed /notebook/papers/2003-18.pdf
Processing /notebook/papers/2003-12.pdf
Could not process /notebook/papers/2003-12.pdf
Traceback (most recent call last):
  File "<ipython-input-12-afd191cdf3c8>", line 4, in <module>
    doc_res = process_pdf_pf2(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-11-a636097a9443>", line 280, in process_pdf_pf2
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str


Processing /notebook/papers/2014-1.pdf
Could not process /notebook/papers/2014-1.pdf
Traceback (most recent call last):
  File "<ipython-input-12-afd191cdf3c8>", line 4, in <module>
    doc_res = process_pdf_pf2(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-11-a636097a9443>", line 280, in process_pdf_pf2
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str


Processing /notebook/papers/2001-10-dendritic-cell-based-xenoantigen-vaccination-for-prostate-cancer-immunotherapy.pdf
Successfully processed /notebook/papers/2001-10-dendritic-cell-based-xenoantigen-vaccination-for-prostate-cancer-immunotherapy.pdf
Processing /notebook/papers/2001-15-in-vivo-transfection-and-or-cross-priming-of-dendritic-cells-following-dna-and-adenoviral-immunizations-for-immunotherapy-of-cancer.pdf
Successfully processed /notebook/papers/2001-15-in-vivo-transfection-and-or-cross-priming-of-dendritic-cells-following-dna-and-adenoviral-immunizations-for-immunotherapy-of-cancer.pdf
Processing /notebook/papers/2012-3.pdf
Successfully processed /notebook/papers/2012-3.pdf
Processing /notebook/papers/2002-8-treatment-of-non-resectable-hepatocellular-carcinoma-with-autologous-tumor-pulsed-dendritic-cells.pdf
Successfully processed /notebook/papers/2002-8-treatment-of-non-resectable-hepatocellular-carcinoma-with-autologous-tumor-pulsed-dendritic-cells.pdf
Processing /notebook/papers/2007-11.pdf
Successfully processed /notebook/papers/2007-11.pdf
Processing /notebook/papers/2011-11.pdf
Successfully processed /notebook/papers/2011-11.pdf
Processing /notebook/papers/2008-6.pdf
Successfully processed /notebook/papers/2008-6.pdf
Processing /notebook/papers/2014-3.pdf
Successfully processed /notebook/papers/2014-3.pdf
Processing /notebook/papers/2000-18-treatment-of-solid-tumours-in-children-with-tumour-lysate-pulsed-dendritic-cells.pdf
Successfully processed /notebook/papers/2000-18-treatment-of-solid-tumours-in-children-with-tumour-lysate-pulsed-dendritic-cells.pdf
Processing /notebook/papers/2003-11.pdf
Successfully processed /notebook/papers/2003-11.pdf
Processing /notebook/papers/2009-4.pdf
Successfully processed /notebook/papers/2009-4.pdf
Processing /notebook/papers/2001-12-immune-and-clinical-responses-in-patients-with-metastatic-melanoma-to-cd34-progenitor-derived-dendritic-cell-vaccine.pdf
Successfully processed /notebook/papers/2001-12-immune-and-clinical-responses-in-patients-with-metastatic-melanoma-to-cd34-progenitor-derived-dendritic-cell-vaccine.pdf
Processing /notebook/papers/2011-3.pdf
Successfully processed /notebook/papers/2011-3.pdf
Processing /notebook/papers/2001-16-regression-of-lymph-node-metastases-by-immunotherapy-using-autologous-breast-tumor-lysate-pulsed-dendritic-cells-report-of-a-case.pdf
Successfully processed /notebook/papers/2001-16-regression-of-lymph-node-metastases-by-immunotherapy-using-autologous-breast-tumor-lysate-pulsed-dendritic-cells-report-of-a-case.pdf
Processing /notebook/papers/2004-1.pdf
Successfully processed /notebook/papers/2004-1.pdf
Processing /notebook/papers/2010-1.pdf
Successfully processed /notebook/papers/2010-1.pdf
Processing /notebook/papers/2007-2.pdf
Successfully processed /notebook/papers/2007-2.pdf
Processing /notebook/papers/2000-11-immunotherapy-of-hormone-refractory-prostate-cancer-with-antigen-loaded-dendritic-cells.pdf
Successfully processed /notebook/papers/2000-11-immunotherapy-of-hormone-refractory-prostate-cancer-with-antigen-loaded-dendritic-cells.pdf
Processing /notebook/papers/2003-16.pdf
Successfully processed /notebook/papers/2003-16.pdf
Processing /notebook/papers/2011-4.pdf
Successfully processed /notebook/papers/2011-4.pdf
Processing /notebook/papers/2002-1-a-phase-i-trial-of-tumor-lysate-pulsed-dendritic-cells-in-the-treatment-of-advanced-cancer.pdf
Successfully processed /notebook/papers/2002-1-a-phase-i-trial-of-tumor-lysate-pulsed-dendritic-cells-in-the-treatment-of-advanced-cancer.pdf
Processing /notebook/papers/2003-5.pdf
Successfully processed /notebook/papers/2003-5.pdf
Processing /notebook/papers/2010-2.pdf
Successfully processed /notebook/papers/2010-2.pdf
Processing /notebook/papers/2003-8.pdf
Successfully processed /notebook/papers/2003-8.pdf
Processing /notebook/papers/2012-2.pdf
Successfully processed /notebook/papers/2012-2.pdf
Processing /notebook/papers/2009-6.pdf
Successfully processed /notebook/papers/2009-6.pdf
Processing /notebook/papers/2003-17.pdf
Successfully processed /notebook/papers/2003-17.pdf
Processing /notebook/papers/2001-14-immunotherapy-of-bladder-cancer-using-autologous-dendritic-cells-pulsed-with-human-lymphocyte-antigen-a24-specific-mage-3-peptide.pdf
Successfully processed /notebook/papers/2001-14-immunotherapy-of-bladder-cancer-using-autologous-dendritic-cells-pulsed-with-human-lymphocyte-antigen-a24-specific-mage-3-peptide.pdf
Processing /notebook/papers/2004-3.pdf
Successfully processed /notebook/papers/2004-3.pdf
Processing /notebook/papers/2001-13-immunotherapy-for-medullary-thyroid-carcinoma-by-dendritic-cell-vaccination.pdf
Successfully processed /notebook/papers/2001-13-immunotherapy-for-medullary-thyroid-carcinoma-by-dendritic-cell-vaccination.pdf
Processing /notebook/papers/2011-5.pdf
Successfully processed /notebook/papers/2011-5.pdf
Processing /notebook/papers/2011-8.pdf
Successfully processed /notebook/papers/2011-8.pdf
Processing /notebook/papers/2003-15.pdf
Successfully processed /notebook/papers/2003-15.pdf
Processing /notebook/papers/2007-12.pdf
Successfully processed /notebook/papers/2007-12.pdf
Processing /notebook/papers/2011-10.pdf
Successfully processed /notebook/papers/2011-10.pdf
Processing /notebook/papers/2009-7.pdf
Successfully processed /notebook/papers/2009-7.pdf
Processing /notebook/papers/2002-2-dendritic-cell-immunotherapy-for-patients-with-metastatic-renal-cell-carcinoma-university-of-tokyo-experience.pdf
Successfully processed /notebook/papers/2002-2-dendritic-cell-immunotherapy-for-patients-with-metastatic-renal-cell-carcinoma-university-of-tokyo-experience.pdf
Processing /notebook/papers/2002-4-immunotherapy-of-solid-cancer-using-dendritic-cells-pulsed-with-the-hla-a24-restricted-peptide-of-carcinoembryonic-antigen.pdf
Successfully processed /notebook/papers/2002-4-immunotherapy-of-solid-cancer-using-dendritic-cells-pulsed-with-the-hla-a24-restricted-peptide-of-carcinoembryonic-antigen.pdf
Processing /notebook/papers/2007-5.pdf
Successfully processed /notebook/papers/2007-5.pdf
Processing /notebook/papers/2011-2.pdf
Successfully processed /notebook/papers/2011-2.pdf
Processing /notebook/papers/2003-7.pdf
Successfully processed /notebook/papers/2003-7.pdf
Processing /notebook/papers/2007-4.pdf
Successfully processed /notebook/papers/2007-4.pdf
Processing /notebook/papers/2002-7-the-feasibility-and-safety-of-immunotherapy-with-dendritic-cells-loaded-with-cea-mrna-following-neoadjuvant-chemoradiotherapy-and-resection-of-pancreatic-cancer.pdf
Successfully processed /notebook/papers/2002-7-the-feasibility-and-safety-of-immunotherapy-with-dendritic-cells-loaded-with-cea-mrna-following-neoadjuvant-chemoradiotherapy-and-resection-of-pancreatic-cancer.pdf
Processing /notebook/papers/2008-5.pdf
Could not process /notebook/papers/2008-5.pdf
Traceback (most recent call last):
  File "<ipython-input-12-afd191cdf3c8>", line 4, in <module>
    doc_res = process_pdf_pf2(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-11-a636097a9443>", line 283, in process_pdf_pf2
    page_filenames = pdf_to_pages(in_file, wd, pages=pages)
  File "<ipython-input-11-a636097a9443>", line 17, in pdf_to_pages
    os.path.join(out_dir, '%04d.png')])
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/subprocess.py", line 291, in check_call
    raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['convert', '-define', 'pdf:use-cropbox=true', '-density', '100', '/notebook/papers/2008-5.pdf', '-sharpen', '0x1.0', '/tmp/tmp6bqcowzh/%04d.png']' returned non-zero exit status 1.


Processing /notebook/papers/2011-12.pdf
Successfully processed /notebook/papers/2011-12.pdf
Processing /notebook/papers/2003-9.pdf
Successfully processed /notebook/papers/2003-9.pdf
Processing /notebook/papers/2008-4.pdf
Successfully processed /notebook/papers/2008-4.pdf
Processing /notebook/papers/2004-2.pdf
Successfully processed /notebook/papers/2004-2.pdf
Processing /notebook/papers/2014-2.pdf
Successfully processed /notebook/papers/2014-2.pdf
Processing /notebook/papers/2009-5.pdf
Successfully processed /notebook/papers/2009-5.pdf
Processing /notebook/papers/2007-7.pdf
Could not process /notebook/papers/2007-7.pdf
Traceback (most recent call last):
  File "<ipython-input-12-afd191cdf3c8>", line 4, in <module>
    doc_res = process_pdf_pf2(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-11-a636097a9443>", line 280, in process_pdf_pf2
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str


Processing /notebook/papers/2011-6.pdf
Successfully processed /notebook/papers/2011-6.pdf
Processing /notebook/papers/2002-5-mucin-gene-muc1-transfected-dendritic-cells-as-vaccine-results-of-a-phase-i-ii-clinical-trial.pdf
Successfully processed /notebook/papers/2002-5-mucin-gene-muc1-transfected-dendritic-cells-as-vaccine-results-of-a-phase-i-ii-clinical-trial.pdf
Processing /notebook/papers/2002-3-immunotherapy-of-metastatic-renal-cell-carcinoma-with-tumor-lysate-pulsed-autologous-dendritic-cells.pdf
Successfully processed /notebook/papers/2002-3-immunotherapy-of-metastatic-renal-cell-carcinoma-with-tumor-lysate-pulsed-autologous-dendritic-cells.pdf
Processing /notebook/papers/2013-1.pdf
Successfully processed /notebook/papers/2013-1.pdf
Processing /notebook/papers/2007-9.pdf
Successfully processed /notebook/papers/2007-9.pdf
Processing /notebook/papers/2011-9.pdf
Successfully processed /notebook/papers/2011-9.pdf
Processing /notebook/papers/2011-1.pdf
Successfully processed /notebook/papers/2011-1.pdf
Processing /notebook/papers/2002-6-optimizing-dendritic-cell-based-immunotherapy-in-multiple-myeloma.pdf
Successfully processed /notebook/papers/2002-6-optimizing-dendritic-cell-based-immunotherapy-in-multiple-myeloma.pdf
Processing /notebook/papers/2001-9-altered-peptide-ligand-vaccination-with-flt3-ligand-expanded-dendritic-cells-for-tumor-immunotherapy.pdf
Successfully processed /notebook/papers/2001-9-altered-peptide-ligand-vaccination-with-flt3-ligand-expanded-dendritic-cells-for-tumor-immunotherapy.pdf
Processing /notebook/papers/2007-6.pdf
Successfully processed /notebook/papers/2007-6.pdf
Processing /notebook/papers/2000-17-phase-i-study-in-melanoma-patients-of-a-vaccine-with-peptide-pulsed-dendritic-cells-generated-in-vitro-from-cd34-hematopoietic-progenitor-cells.pdf
Successfully processed /notebook/papers/2000-17-phase-i-study-in-melanoma-patients-of-a-vaccine-with-peptide-pulsed-dendritic-cells-generated-in-vitro-from-cd34-hematopoietic-progenitor-cells.pdf
Processing /notebook/papers/2012-1.pdf
Could not process /notebook/papers/2012-1.pdf
Traceback (most recent call last):
  File "<ipython-input-12-afd191cdf3c8>", line 4, in <module>
    doc_res = process_pdf_pf2(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-11-a636097a9443>", line 280, in process_pdf_pf2
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str


Processing /notebook/papers/2008-1.pdf
Successfully processed /notebook/papers/2008-1.pdf
Processing /notebook/papers/2003-6.pdf
Successfully processed /notebook/papers/2003-6.pdf
Processing /notebook/papers/2008-3.pdf
Successfully processed /notebook/papers/2008-3.pdf
Processing /notebook/papers/2007-1-unlocked-by-wwwfreemypdfcom.pdf
Successfully processed /notebook/papers/2007-1-unlocked-by-wwwfreemypdfcom.pdf
Processing /notebook/papers/2003-2.pdf
Successfully processed /notebook/papers/2003-2.pdf

Evaluate



In [12]:

    
def rect_color_to_class(color):
    color = color.lower()
    if color in ('ff0000', '00ff00'):
        return 'surrounding_text'
    else:
        return 'body'


def get_svg_rectangles(svg_file):
    rects_by_class = collections.defaultdict(list)
    with open(svg_file, 'r') as f:
        tree = lxml.etree.parse(f)
        for n in list(tree.getroot().getchildren()):
            if n.tag.endswith('rect'):
                stroke = get_stroke(n)
                x = float(n.get('x'))
                y = float(n.get('y'))
                w = float(n.get('width'))
                h = float(n.get('height'))
                rects_by_class[rect_color_to_class(stroke)].append(numpy.array([y, x, y+h, x+w]))
    return rects_by_class


def rect_area(rect):
    y1, x1, y2, x2 = rect
    return (y2 - y1) * (x2 - x1)


def rect_intersection(a, b):
    ay1, ax1, ay2, ax2 = a
    by1, bx1, by2, bx2 = b
    iy1 = max(ay1, by1)
    ix1 = max(ax1, bx1)
    iy2 = min(ay2, by2)
    ix2 = min(ax2, bx2)
    if ix2 - ix1 <= 0 or iy2 - iy1 <= 0:
        return None
    return numpy.array([iy1, ix1, iy2, ix2])


def rect_union(a, b):
    ay1, ax1, ay2, ax2 = a
    by1, bx1, by2, bx2 = b
    return numpy.array([min(ay1, by1),
                        min(ax1, bx1),
                        max(ay2, by2),
                        max(ax2, bx2)])


def rect_dice(a, b):
    intersect = rect_intersection(a, b)
    if not intersect is None:
        return rect_area(intersect) / rect_area(rect_union(a, b))
    else:
        return 0


def calc_page_stat(gold_rects, found_rects, min_tp_dice=0.1):
    fp = 0
    tp = 0
    sum_tp_dice = 0
    full_dice = 0
    found_gr = set()

    for fr in found_rects:
        best_dice = 0
        best_gr_i = None
        for i, gr in enumerate(gold_rects):
            dice = rect_dice(fr, gr)
            if dice > best_dice:
                best_dice = dice
                best_gr_i = i

        if best_gr_i is None or best_dice < min_tp_dice:
            fp += 1
        else:
            tp += 1
            sum_tp_dice += best_dice
            found_gr.add(best_gr_i)
        full_dice += best_dice

    return dict(found_n=len(found_rects),
                gold_n=len(gold_rects),
                fp=fp,
                tp=tp,
                fn=len(gold_rects) - len(found_gr),
                tp_dice=sum_tp_dice,
                full_dice=full_dice)


def eval_on_single_doc(file, gold_rect_getter, rect_getter, min_tp_dice=0.1):
    gold_rects = gold_rect_getter(file)
    dirty_rects = rect_getter(file)
    classes = set(gold_rects.keys()) | set(dirty_rects.keys())
    return pandas.concat([pandas.Series(calc_page_stat(gold_rects.get(cls, []),
                                                       dirty_rects.get(cls, []),
                                                       min_tp_dice=min_tp_dice))
                          for cls in classes],
                         keys=classes)


def aggregate_df_metrics(df):
    result = []
    categories = set(df.columns.get_level_values(0))
    for category in categories:
        cat_metrics = df.xs(category, axis=1, level=0)

        non_empty_docs = cat_metrics[['gold_n', 'found_n']].sum(axis=1) > 0
        cat_metrics = cat_metrics[non_empty_docs]

        cat_metrics.tp_dice /= cat_metrics.tp
        cat_metrics.full_dice /= cat_metrics.found_n
        cat_metrics.fp /= cat_metrics.found_n
        cat_metrics.tp /= cat_metrics.found_n
        cat_metrics.fn /= cat_metrics.gold_n

        cat_metrics['prec'] = cat_metrics.tp / (cat_metrics.tp + cat_metrics.fp)
        cat_metrics['rec'] = cat_metrics.tp / (cat_metrics.tp + cat_metrics.fn)
        cat_metrics['f1'] = 2 * cat_metrics.prec * cat_metrics.rec / (cat_metrics.prec + cat_metrics.rec)

        result.append(cat_metrics['gold_n found_n f1 prec rec fp tp fn full_dice tp_dice'.split(' ')])

    result = pandas.concat(result,
                           axis=1,
                           keys=categories)
    result.replace([numpy.inf, -numpy.inf], numpy.nan, inplace=True)
    result.fillna(0, inplace=True)
    return result


def bootstrap_micro(raw_metrics, n=10):
    subsampled_metrics = []
    for _ in range(n):
        subsample = raw_metrics.sample(frac=1, replace=True, axis=0)
        sub_micro = aggregate_df_metrics(subsample.sum(axis=0).to_frame().transpose())
        subsampled_metrics.append(sub_micro)
    return pandas.concat(subsampled_metrics, axis=0)


def calc_metrics_per_dir(in_dir, gold_rect_getter, rect_getter, extension='svg', min_tp_dice=0.1):
    metrics = []
    for f in list(glob.glob(os.path.join(in_dir, '*.{}'.format(extension)))):
        try:
            metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
        except BaseException as e:
            print('Could not eval on {} due to {}'.format(f, e))
            print(traceback.format_exc())

    metrics = pandas.DataFrame(metrics)
    macro = aggregate_df_metrics(metrics).describe().loc[['mean', 'std']]
    micro = bootstrap_micro(metrics).describe().loc[['mean', 'std']]

    result = pandas.concat([macro, micro],
                            axis=1,
                            keys=['macro', 'micro'])
#     result = result.reorder_levels([1, 2, 0], axis=1)
#     result.sort_index(level=0, axis=1, inplace=True)
    return result


def make_nn_rect_getter(model, **process_kwargs):
    def _impl(svg_file):
        with tempfile.TemporaryDirectory() as wd:
            in_fname, _ = convert_svg(svg_file, wd)
            boxes = process_page(model, in_fname, wd, **process_kwargs)
            return dict(body=[box for channel, box in boxes if channel == 1],
                        surrounding_text=[box for channel, box in boxes if channel == 0])
    return _impl


def get_pdf_file_and_page(pdfs_dir, svg_file, page_no_one_based=False):
    # take first and last elements of splitted filename
    doc_id, *_, page_no = os.path.splitext(os.path.basename(svg_file))[0].split('-')
    page_no = int(page_no) # expected to be zero-based
    if page_no_one_based:
        page_no -= 1
    return os.path.join(pdfs_dir, doc_id + '.pdf'), page_no


def make_pure_nn_rect_getter(model, **process_kwargs):
    def _impl(svg_file):
        with tempfile.TemporaryDirectory() as wd:
            in_fname, _ = convert_svg(svg_file, wd)
            boxes = process_page(model, in_fname, wd, **process_kwargs)
            return dict(body=[box for channel, box in boxes if channel == 1],
                        surrounding_text=[box for channel, box in boxes if channel == 0])
    return _impl


def make_nn_tabula_rect_getter(model, pdfs_dir, page_no_one_based=False, **process_kwargs):
    def _impl(svg_file):
        with tempfile.TemporaryDirectory() as wd:
            pdf_fname, page_no = get_pdf_file_and_page(pdfs_dir, svg_file, page_no_one_based=page_no_one_based)
            tinfos = process_pdf(model, pdf_fname, pages=[page_no])
            return dict(body=[tinfo.body_box for tinfo in tinfos],
                        surrounding_text=[box
                                          for tinfo in tinfos
                                          for box in tinfo.surrounding_text_boxes])
    return _impl


def tabula_is_bad_cell(cell_info):
    return cell_info['height'] == 0 or cell_info['width'] == 0


def get_tabula_table_rect(tabula_table_info):
    top = 1e100
    left = 1e100
    bottom = 0
    right = 0
    for row in tabula_table_info['data']:
        for cell in row:
            if tabula_is_bad_cell(cell):
                continue
            if cell['top'] < top:
                top = cell['top']
            if cell['left'] < left:
                left = cell['left']
            cell_bottom = cell['top'] + cell['height']
            if cell_bottom > bottom:
                bottom = cell_bottom
            cell_right = cell['left'] + cell['width']
            if cell_right > right:
                right = cell_right
    return (top, left, bottom, right)


def make_pure_tabula_rect_getter(pdfs_dir, **tabula_kwargs):
    def _impl(svg_file):
        pdf_fname, page_no = get_pdf_file_and_page(pdfs_dir, svg_file)
        tabula_res = tabula.read_pdf(pdf_fname, output_format='json', pages=page_no + 1)
        return dict(body=[get_tabula_table_rect(table) for table in tabula_res])
    return _impl


def make_pdffigures_rect_getter(pdffigures_res_dir, **tabula_kwargs):
    def _impl(svg_file):
        pdf_fname, page_no = get_pdf_file_and_page(pdffigures_res_dir, svg_file)
        json_fname = os.path.splitext(pdf_fname)[0] + '.json'
        with open(json_fname, 'r') as f:
            pdffigures_res = json.load(f)
        page_tables = [fig for fig in pdffigures_res
                       if fig['figType'] == 'Table'
                       and fig['page'] == page_no] # pdffigures2 uses 0-based page numbering
        return dict(body=[pdffigures_format_rect_from_dict(t['regionBoundary']) for t in page_tables],
                    surrounding_text=[pdffigures_format_rect_from_dict(t['captionBoundary']) for t in page_tables])
    return _impl

Evaluate pure model



In [5]:

    
pure_nn_eval_result = []
for fold in list(glob.glob('/notebook/data/6_eval/*')):
    fold_model = segmentation_model = load_model(os.path.join(fold, 'model'),
                                                 custom_objects=model.__dict__)
    pure_nn_eval_result.append(calc_metrics_per_dir(os.path.join(fold, 'test'),
                                                    get_svg_rectangles,
                                                    make_pure_nn_rect_getter(fold_model),
                                                    min_tp_dice=0.1))
pure_nn_eval_agg = pandas.DataFrame([r.loc['mean'] for r in pure_nn_eval_result]).describe()
pure_nn_eval_agg.transpose()[['mean', 'std']]









    Out[5]:







  
    
      
      
      
      mean
      std
    
  
  
    
      macro
      body
      gold_n
      1.141149
      0.114382
    
    
      found_n
      1.401379
      0.223328
    
    
      f1
      0.903669
      0.056467
    
    
      prec
      0.880548
      0.069092
    
    
      rec
      0.966590
      0.024446
    
    
      fp
      0.119452
      0.069092
    
    
      tp
      0.880548
      0.069092
    
    
      fn
      0.023448
      0.021617
    
    
      full_dice
      0.812553
      0.075504
    
    
      tp_dice
      0.903935
      0.033206
    
    
      surrounding_text
      gold_n
      1.993793
      0.175679
    
    
      found_n
      2.220460
      0.163783
    
    
      f1
      0.883068
      0.067647
    
    
      prec
      0.869180
      0.076181
    
    
      rec
      0.927671
      0.057664
    
    
      fp
      0.130820
      0.076181
    
    
      tp
      0.869180
      0.076181
    
    
      fn
      0.070284
      0.060733
    
    
      full_dice
      0.669518
      0.096178
    
    
      tp_dice
      0.736466
      0.069719
    
    
      micro
      body
      gold_n
      33.820000
      3.742593
    
    
      found_n
      41.480000
      6.674354
    
    
      f1
      0.862285
      0.081962
    
    
      prec
      0.796526
      0.117163
    
    
      rec
      0.949871
      0.054951
    
    
      fp
      0.203474
      0.117163
    
    
      tp
      0.796526
      0.117163
    
    
      fn
      0.042554
      0.044005
    
    
      full_dice
      0.732024
      0.122049
    
    
      tp_dice
      0.915295
      0.025316
    
    
      surrounding_text
      gold_n
      58.260000
      5.848333
    
    
      found_n
      65.020000
      5.477865
    
    
      f1
      0.869754
      0.069923
    
    
      prec
      0.833570
      0.096253
    
    
      rec
      0.914769
      0.056330
    
    
      fp
      0.166430
      0.096253
    
    
      tp
      0.833570
      0.096253
    
    
      fn
      0.078811
      0.053997
    
    
      full_dice
      0.632733
      0.110706
    
    
      tp_dice
      0.754627
      0.052377

Evaluate model+tabula



In [6]:

    
nn_tabula_eval_result = []
for fold in list(glob.glob('/notebook/data/6_eval/*')):
    fold_model = segmentation_model = load_model(os.path.join(fold, 'model'),
                                                 custom_objects=model.__dict__)
    nn_tabula_eval_result.append(calc_metrics_per_dir(os.path.join(fold, 'test'),
                                                      get_svg_rectangles,
                                                      make_nn_tabula_rect_getter(fold_model, '/notebook/data/0_source_pdfs/'),
                                                      min_tp_dice=0.1))
nn_tabula_eval_agg = pandas.DataFrame([r.loc['mean'] for r in nn_tabula_eval_result]).describe()
nn_tabula_eval_agg.transpose()[['mean', 'std']]









    



Could not eval on /notebook/data/6_eval/1/test/17197902-0003.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/1/test/18207305-0002.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/1/test/18207305-0001.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/1/test/17669561-0007.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/3/test/17274750-0002.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/3/test/18207305-0003.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/4/test/17274750-0001.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/4/test/17669561-0003.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/4/test/17669561-0004.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/0/test/18950932-0001.svg due to 






    Out[6]:







  
    
      
      
      
      mean
      std
    
  
  
    
      macro
      body
      gold_n
      1.151319
      0.107257
    
    
      found_n
      1.087912
      0.080378
    
    
      f1
      0.937015
      0.040542
    
    
      prec
      0.945661
      0.037127
    
    
      rec
      0.934056
      0.038731
    
    
      fp
      0.025979
      0.015417
    
    
      tp
      0.945661
      0.037127
    
    
      fn
      0.066876
      0.047938
    
    
      full_dice
      0.846998
      0.035813
    
    
      tp_dice
      0.851393
      0.030730
    
    
      surrounding_text
      gold_n
      1.994347
      0.149263
    
    
      found_n
      0.730411
      0.106578
    
    
      f1
      0.483515
      0.067686
    
    
      prec
      0.571302
      0.076951
    
    
      rec
      0.425158
      0.062340
    
    
      fp
      0.060484
      0.033095
    
    
      tp
      0.571302
      0.076951
    
    
      fn
      0.641930
      0.064289
    
    
      full_dice
      0.370886
      0.052942
    
    
      tp_dice
      0.372481
      0.054502
    
    
      micro
      body
      gold_n
      32.300000
      3.369718
    
    
      found_n
      30.460000
      2.117310
    
    
      f1
      0.942167
      0.038871
    
    
      prec
      0.965468
      0.027427
    
    
      rec
      0.921209
      0.051651
    
    
      fp
      0.034532
      0.027427
    
    
      tp
      0.965468
      0.027427
    
    
      fn
      0.085168
      0.058078
    
    
      full_dice
      0.862897
      0.033524
    
    
      tp_dice
      0.894209
      0.039020
    
    
      surrounding_text
      gold_n
      55.620000
      4.053024
    
    
      found_n
      20.120000
      3.920077
    
    
      f1
      0.703576
      0.039193
    
    
      prec
      0.906132
      0.045132
    
    
      rec
      0.575580
      0.036015
    
    
      fp
      0.093868
      0.045132
    
    
      tp
      0.906132
      0.045132
    
    
      fn
      0.668813
      0.073337
    
    
      full_dice
      0.591374
      0.072365
    
    
      tp_dice
      0.650999
      0.059610

Evaluate model+tabula - full train



In [21]:

    
nn_tabula_full_eval_result = []
for fold in list(glob.glob('/notebook/data/6_eval/*')):
    nn_tabula_full_eval_result.append(calc_metrics_per_dir(os.path.join(fold, 'test'),
                                      get_svg_rectangles,
                                      make_nn_tabula_rect_getter(segmentation_model, '/notebook/data/0_source_pdfs/'),
                                      min_tp_dice=0.1))
nn_tabula_full_eval_agg = pandas.DataFrame([r.loc['mean'] for r in nn_tabula_full_eval_result]).describe()
nn_tabula_full_eval_agg.transpose()[['mean', 'std']]









    



Could not eval on /notebook/data/6_eval/1/test/17197902-0003.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/1/test/19794983-0003.svg due to Error tokenizing data. C error: Expected 1 fields in line 6, saw 2

Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 192, in process_pdf
    body = extract_table_with_tabula(in_file, page_i + 1, body_box)
  File "<ipython-input-3-9f22be994290>", line 113, in extract_table_with_tabula
    return tabula.read_pdf(pdf_file, guess=False, pages=page_no, area=tuple(box))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/tabula/wrapper.py", line 97, in read_pdf
    return pd.read_csv(io.BytesIO(output), **pandas_options)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/io/parsers.py", line 655, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/io/parsers.py", line 411, in _read
    data = parser.read(nrows)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/io/parsers.py", line 1005, in read
    ret = self._engine.read(nrows)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/io/parsers.py", line 1748, in read
    data = self._reader.read(nrows)
  File "pandas/_libs/parsers.pyx", line 890, in pandas._libs.parsers.TextReader.read (pandas/_libs/parsers.c:10862)
  File "pandas/_libs/parsers.pyx", line 912, in pandas._libs.parsers.TextReader._read_low_memory (pandas/_libs/parsers.c:11138)
  File "pandas/_libs/parsers.pyx", line 966, in pandas._libs.parsers.TextReader._read_rows (pandas/_libs/parsers.c:11884)
  File "pandas/_libs/parsers.pyx", line 953, in pandas._libs.parsers.TextReader._tokenize_rows (pandas/_libs/parsers.c:11755)
  File "pandas/_libs/parsers.pyx", line 2184, in pandas._libs.parsers.raise_parser_error (pandas/_libs/parsers.c:28765)
pandas.errors.ParserError: Error tokenizing data. C error: Expected 1 fields in line 6, saw 2


Could not eval on /notebook/data/6_eval/1/test/18207305-0002.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/1/test/18207305-0001.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/1/test/17669561-0007.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/3/test/17274750-0002.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/3/test/18207305-0003.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/4/test/17274750-0001.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/4/test/17669561-0003.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/4/test/17669561-0004.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/0/test/18950932-0001.svg due to 
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 180, in process_pdf
    page = parsed_pdf.get_page(page_i)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 595, in get_page
    return self._cached_pages(target_page=page_number)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 627, in _cached_pages
    next_page = next(self._pages_iter)
StopIteration







    Out[21]:







  
    
      
      
      
      mean
      std
    
  
  
    
      macro
      body
      gold_n
      1.160549
      0.095025
    
    
      found_n
      1.052095
      0.098655
    
    
      f1
      0.925284
      0.060907
    
    
      prec
      0.935260
      0.045418
    
    
      rec
      0.918657
      0.070972
    
    
      fp
      0.013810
      0.018928
    
    
      tp
      0.935260
      0.045418
    
    
      fn
      0.089788
      0.085035
    
    
      full_dice
      0.855598
      0.062039
    
    
      tp_dice
      0.855598
      0.062039
    
    
      surrounding_text
      gold_n
      2.008945
      0.126738
    
    
      found_n
      0.675626
      0.081495
    
    
      f1
      0.450365
      0.057326
    
    
      prec
      0.528766
      0.066409
    
    
      rec
      0.398208
      0.051894
    
    
      fp
      0.055359
      0.026994
    
    
      tp
      0.528766
      0.066409
    
    
      fn
      0.667816
      0.049357
    
    
      full_dice
      0.416132
      0.055835
    
    
      tp_dice
      0.418778
      0.052229
    
    
      micro
      body
      gold_n
      31.740000
      3.444271
    
    
      found_n
      28.940000
      2.725436
    
    
      f1
      0.950101
      0.046783
    
    
      prec
      0.985612
      0.024365
    
    
      rec
      0.920442
      0.078647
    
    
      fp
      0.014388
      0.024365
    
    
      tp
      0.985612
      0.024365
    
    
      fn
      0.093861
      0.101933
    
    
      full_dice
      0.900556
      0.029966
    
    
      tp_dice
      0.913698
      0.021993
    
    
      surrounding_text
      gold_n
      54.380000
      4.580611
    
    
      found_n
      18.620000
      3.425201
    
    
      f1
      0.695261
      0.041947
    
    
      prec
      0.901733
      0.053354
    
    
      rec
      0.566132
      0.034993
    
    
      fp
      0.098267
      0.053354
    
    
      tp
      0.901733
      0.053354
    
    
      fn
      0.690549
      0.056853
    
    
      full_dice
      0.715125
      0.050853
    
    
      tp_dice
      0.792002
      0.017137

Evaluate pure pdffigures2



In [15]:

    
pdffigures_eval_result = []
for fold in list(glob.glob('/notebook/data/6_eval/*')):
    pdffigures_eval_result.append(calc_metrics_per_dir(os.path.join(fold, 'test'),
                                                       get_svg_rectangles,
                                                       make_pdffigures_rect_getter('/notebook/data/1_pdffigures2_out/'),
                                                       min_tp_dice=0.1))
pdffigures_eval_agg = pandas.DataFrame([r.loc['mean'] for r in pdffigures_eval_result]).describe()
pdffigures_eval_agg.transpose()[['mean', 'std']]









    Out[15]:







  
    
      
      
      
      mean
      std
    
  
  
    
      macro
      body
      gold_n
      1.133333
      0.113039
    
    
      found_n
      1.093333
      0.064118
    
    
      f1
      0.932111
      0.040186
    
    
      prec
      0.943333
      0.046547
    
    
      rec
      0.926222
      0.037921
    
    
      fp
      0.056667
      0.046547
    
    
      tp
      0.943333
      0.046547
    
    
      fn
      0.064444
      0.036977
    
    
      full_dice
      0.808275
      0.064085
    
    
      tp_dice
      0.811512
      0.069472
    
    
      surrounding_text
      gold_n
      1.980000
      0.170945
    
    
      found_n
      1.093333
      0.064118
    
    
      f1
      0.759039
      0.033478
    
    
      prec
      0.906667
      0.043461
    
    
      rec
      0.661135
      0.031272
    
    
      fp
      0.093333
      0.043461
    
    
      tp
      0.906667
      0.043461
    
    
      fn
      0.443889
      0.054569
    
    
      full_dice
      0.380587
      0.035512
    
    
      tp_dice
      0.380015
      0.035703
    
    
      micro
      body
      gold_n
      34.280000
      3.631391
    
    
      found_n
      33.180000
      3.016123
    
    
      f1
      0.937248
      0.040129
    
    
      prec
      0.950276
      0.044156
    
    
      rec
      0.926139
      0.050250
    
    
      fp
      0.049724
      0.044156
    
    
      tp
      0.950276
      0.044156
    
    
      fn
      0.077997
      0.055654
    
    
      full_dice
      0.805343
      0.065503
    
    
      tp_dice
      0.846678
      0.034291
    
    
      surrounding_text
      gold_n
      59.940000
      6.010241
    
    
      found_n
      33.180000
      3.016123
    
    
      f1
      0.759332
      0.050709
    
    
      prec
      0.914411
      0.070916
    
    
      rec
      0.649574
      0.038925
    
    
      fp
      0.085589
      0.070916
    
    
      tp
      0.914411
      0.070916
    
    
      fn
      0.491731
      0.045517
    
    
      full_dice
      0.380925
      0.041536
    
    
      tp_dice
      0.415237
      0.020917

Evaluate pure tabula



In [13]:

    
pure_tabula_eval_result = []
for fold in list(glob.glob('/notebook/data/6_eval/*')):
    pure_tabula_eval_result.append(calc_metrics_per_dir(os.path.join(fold, 'test'),
                                                        get_svg_rectangles,
                                                        make_pure_tabula_rect_getter('/notebook/data/0_source_pdfs/'),
                                                        min_tp_dice=0.1))
pure_tabula_eval_agg = pandas.DataFrame([r.loc['mean'] for r in pure_tabula_eval_result]).describe()
pure_tabula_eval_agg.transpose()[['mean', 'std']]









    Out[13]:







  
    
      
      
      
      mean
      std
    
  
  
    
      macro
      body
      gold_n
      1.147816
      0.101968
    
    
      found_n
      1.164138
      0.149322
    
    
      f1
      0.612304
      0.038001
    
    
      prec
      0.589272
      0.038479
    
    
      rec
      0.681245
      0.052947
    
    
      fp
      0.256015
      0.078403
    
    
      tp
      0.589272
      0.038479
    
    
      fn
      0.322567
      0.064521
    
    
      full_dice
      0.382755
      0.045143
    
    
      tp_dice
      0.464457
      0.055415
    
    
      surrounding_text
      gold_n
      2.005747
      0.154252
    
    
      found_n
      0.000000
      0.000000
    
    
      f1
      0.000000
      0.000000
    
    
      prec
      0.000000
      0.000000
    
    
      rec
      0.000000
      0.000000
    
    
      fp
      0.000000
      0.000000
    
    
      tp
      0.000000
      0.000000
    
    
      fn
      0.993103
      0.015421
    
    
      full_dice
      0.000000
      0.000000
    
    
      tp_dice
      0.000000
      0.000000
    
    
      micro
      body
      gold_n
      34.120000
      2.981946
    
    
      found_n
      34.800000
      4.403408
    
    
      f1
      0.657371
      0.059450
    
    
      prec
      0.661827
      0.086219
    
    
      rec
      0.659942
      0.051743
    
    
      fp
      0.338173
      0.086219
    
    
      tp
      0.661827
      0.086219
    
    
      fn
      0.346269
      0.077654
    
    
      full_dice
      0.435130
      0.070421
    
    
      tp_dice
      0.653864
      0.057612
    
    
      surrounding_text
      gold_n
      59.300000
      4.669047
    
    
      found_n
      0.000000
      0.000000
    
    
      f1
      0.000000
      0.000000
    
    
      prec
      0.000000
      0.000000
    
    
      rec
      0.000000
      0.000000
    
    
      fp
      0.000000
      0.000000
    
    
      tp
      0.000000
      0.000000
    
    
      fn
      1.000000
      0.000000
    
    
      full_dice
      0.000000
      0.000000
    
    
      tp_dice
      0.000000
      0.000000

Summary



In [18]:

    
summary_our_dataset = pandas.concat([
                                     pure_nn_eval_agg.transpose()[['mean', 'std']],
                                     nn_tabula_eval_agg.transpose()[['mean', 'std']],
                                     pdffigures_eval_agg.transpose()[['mean', 'std']],
                                     pure_tabula_eval_agg.transpose()[['mean', 'std']],
                                     ],
                                    axis=1,
                                    keys=[
                                          'pure_nn',
                                          'nn_tabula',
                                          'pdffigures2',
                                          'pure_tabula'
                                          ])
summary_our_dataset









    Out[18]:







  
    
      
      
      
      pure_nn
      nn_tabula
      pdffigures2
      pure_tabula
    
    
      
      
      
      mean
      std
      mean
      std
      mean
      std
      mean
      std
    
  
  
    
      macro
      body
      gold_n
      1.141149
      0.114382
      1.151319
      0.107257
      1.133333
      0.113039
      1.147816
      0.101968
    
    
      found_n
      1.401379
      0.223328
      1.087912
      0.080378
      1.093333
      0.064118
      1.164138
      0.149322
    
    
      f1
      0.903669
      0.056467
      0.937015
      0.040542
      0.932111
      0.040186
      0.612304
      0.038001
    
    
      prec
      0.880548
      0.069092
      0.945661
      0.037127
      0.943333
      0.046547
      0.589272
      0.038479
    
    
      rec
      0.966590
      0.024446
      0.934056
      0.038731
      0.926222
      0.037921
      0.681245
      0.052947
    
    
      fp
      0.119452
      0.069092
      0.025979
      0.015417
      0.056667
      0.046547
      0.256015
      0.078403
    
    
      tp
      0.880548
      0.069092
      0.945661
      0.037127
      0.943333
      0.046547
      0.589272
      0.038479
    
    
      fn
      0.023448
      0.021617
      0.066876
      0.047938
      0.064444
      0.036977
      0.322567
      0.064521
    
    
      full_dice
      0.812553
      0.075504
      0.846998
      0.035813
      0.808275
      0.064085
      0.382755
      0.045143
    
    
      tp_dice
      0.903935
      0.033206
      0.851393
      0.030730
      0.811512
      0.069472
      0.464457
      0.055415
    
    
      surrounding_text
      gold_n
      1.993793
      0.175679
      1.994347
      0.149263
      1.980000
      0.170945
      2.005747
      0.154252
    
    
      found_n
      2.220460
      0.163783
      0.730411
      0.106578
      1.093333
      0.064118
      0.000000
      0.000000
    
    
      f1
      0.883068
      0.067647
      0.483515
      0.067686
      0.759039
      0.033478
      0.000000
      0.000000
    
    
      prec
      0.869180
      0.076181
      0.571302
      0.076951
      0.906667
      0.043461
      0.000000
      0.000000
    
    
      rec
      0.927671
      0.057664
      0.425158
      0.062340
      0.661135
      0.031272
      0.000000
      0.000000
    
    
      fp
      0.130820
      0.076181
      0.060484
      0.033095
      0.093333
      0.043461
      0.000000
      0.000000
    
    
      tp
      0.869180
      0.076181
      0.571302
      0.076951
      0.906667
      0.043461
      0.000000
      0.000000
    
    
      fn
      0.070284
      0.060733
      0.641930
      0.064289
      0.443889
      0.054569
      0.993103
      0.015421
    
    
      full_dice
      0.669518
      0.096178
      0.370886
      0.052942
      0.380587
      0.035512
      0.000000
      0.000000
    
    
      tp_dice
      0.736466
      0.069719
      0.372481
      0.054502
      0.380015
      0.035703
      0.000000
      0.000000
    
    
      micro
      body
      gold_n
      33.820000
      3.742593
      32.300000
      3.369718
      34.280000
      3.631391
      34.120000
      2.981946
    
    
      found_n
      41.480000
      6.674354
      30.460000
      2.117310
      33.180000
      3.016123
      34.800000
      4.403408
    
    
      f1
      0.862285
      0.081962
      0.942167
      0.038871
      0.937248
      0.040129
      0.657371
      0.059450
    
    
      prec
      0.796526
      0.117163
      0.965468
      0.027427
      0.950276
      0.044156
      0.661827
      0.086219
    
    
      rec
      0.949871
      0.054951
      0.921209
      0.051651
      0.926139
      0.050250
      0.659942
      0.051743
    
    
      fp
      0.203474
      0.117163
      0.034532
      0.027427
      0.049724
      0.044156
      0.338173
      0.086219
    
    
      tp
      0.796526
      0.117163
      0.965468
      0.027427
      0.950276
      0.044156
      0.661827
      0.086219
    
    
      fn
      0.042554
      0.044005
      0.085168
      0.058078
      0.077997
      0.055654
      0.346269
      0.077654
    
    
      full_dice
      0.732024
      0.122049
      0.862897
      0.033524
      0.805343
      0.065503
      0.435130
      0.070421
    
    
      tp_dice
      0.915295
      0.025316
      0.894209
      0.039020
      0.846678
      0.034291
      0.653864
      0.057612
    
    
      surrounding_text
      gold_n
      58.260000
      5.848333
      55.620000
      4.053024
      59.940000
      6.010241
      59.300000
      4.669047
    
    
      found_n
      65.020000
      5.477865
      20.120000
      3.920077
      33.180000
      3.016123
      0.000000
      0.000000
    
    
      f1
      0.869754
      0.069923
      0.703576
      0.039193
      0.759332
      0.050709
      0.000000
      0.000000
    
    
      prec
      0.833570
      0.096253
      0.906132
      0.045132
      0.914411
      0.070916
      0.000000
      0.000000
    
    
      rec
      0.914769
      0.056330
      0.575580
      0.036015
      0.649574
      0.038925
      0.000000
      0.000000
    
    
      fp
      0.166430
      0.096253
      0.093868
      0.045132
      0.085589
      0.070916
      0.000000
      0.000000
    
    
      tp
      0.833570
      0.096253
      0.906132
      0.045132
      0.914411
      0.070916
      0.000000
      0.000000
    
    
      fn
      0.078811
      0.053997
      0.668813
      0.073337
      0.491731
      0.045517
      1.000000
      0.000000
    
    
      full_dice
      0.632733
      0.110706
      0.591374
      0.072365
      0.380925
      0.041536
      0.000000
      0.000000
    
    
      tp_dice
      0.754627
      0.052377
      0.650999
      0.059610
      0.415237
      0.020917
      0.000000
      0.000000

Evaluate on pdffigures2 datasets



In [12]:

    
# !apt-get update && apt-get install -yqq poppler-utils
# !python /notebook/pdffigures2/evaluation/download_from_urls.py -g -c

Their dataset, our pipeline



In [9]:

    
def pdffigures_format_rect_from_list(lst):
    x1, y1, x2, y2 = lst
    return (y1, x1, y2, x2)

def make_pdffigures2_gold_rects_getter(annotations_file):
    with open(annotations_file, 'r') as f:
        annotations = json.load(f)

    def _impl(file):
        doc_id, *_, page_no = os.path.splitext(os.path.basename(file))[0].split('-')
        page_no = int(page_no)

        page_tables = [fig for fig in annotations[doc_id]['figures']
                       if fig['figure_type'] == 'Table'
                       and fig['page'] == page_no]
        return dict(body=[pdffigures_format_rect_from_list(fig['region_bb']) for fig in page_tables],
                    surrounding_text=[pdffigures_format_rect_from_list(t['caption_bb']) for t in page_tables])
    return _impl

CS-150 dataset



In [14]:

    
segmentation_model = load_model('/notebook/models/full_model', custom_objects=model.__dict__)
nn_tabula_pf2_eval_result = calc_metrics_per_dir('/notebook/pdffigures2/evaluation/datasets/conference/page_images_color_only_with_tables/',
                                                 make_pdffigures2_gold_rects_getter('/notebook/pdffigures2/evaluation/datasets/conference/annotations.json'),
                                                 make_nn_tabula_rect_getter(segmentation_model,
                                                                            '/notebook/pdffigures2/evaluation/datasets/conference/pdfs/',
                                                                            page_no_one_based=True),
                                                 min_tp_dice=0.2,
                                                 extension='jpg')
nn_tabula_pf2_eval_result.transpose()[['mean', 'std']]









    












    












    












    












    












    












    












    












    












    












    












    












    












    












    












    












    












    












    












    












    Out[14]:







  
    
      
      
      
      mean
      std
    
  
  
    
      macro
      surrounding_text
      gold_n
      0.666667
      0.516398
    
    
      found_n
      0.833333
      1.602082
    
    
      f1
      0.233333
      0.408248
    
    
      prec
      0.208333
      0.400520
    
    
      rec
      0.333333
      0.516398
    
    
      fp
      0.125000
      0.306186
    
    
      tp
      0.208333
      0.400520
    
    
      fn
      0.333333
      0.516398
    
    
      full_dice
      0.065484
      0.102478
    
    
      tp_dice
      0.152247
      0.279583
    
    
      body
      gold_n
      0.666667
      0.516398
    
    
      found_n
      1.500000
      0.836660
    
    
      f1
      0.444444
      0.501848
    
    
      prec
      0.416667
      0.491596
    
    
      rec
      0.500000
      0.547723
    
    
      fp
      0.583333
      0.491596
    
    
      tp
      0.416667
      0.491596
    
    
      fn
      0.166667
      0.408248
    
    
      full_dice
      0.412880
      0.443159
    
    
      tp_dice
      0.464098
      0.510009
    
    
      micro
      surrounding_text
      gold_n
      3.400000
      2.118700
    
    
      found_n
      3.600000
      2.503331
    
    
      f1
      0.553935
      0.296158
    
    
      prec
      0.547500
      0.348897
    
    
      rec
      0.623969
      0.336987
    
    
      fp
      0.352500
      0.316327
    
    
      tp
      0.547500
      0.348897
    
    
      fn
      0.383333
      0.330572
    
    
      full_dice
      0.175890
      0.064312
    
    
      tp_dice
      0.349353
      0.194061
    
    
      body
      gold_n
      3.400000
      2.118700
    
    
      found_n
      7.300000
      4.029061
    
    
      f1
      0.509852
      0.219189
    
    
      prec
      0.401515
      0.202438
    
    
      rec
      0.797689
      0.297804
    
    
      fp
      0.598485
      0.202438
    
    
      tp
      0.401515
      0.202438
    
    
      fn
      0.157143
      0.254328
    
    
      full_dice
      0.389217
      0.180099
    
    
      tp_dice
      0.916280
      0.034531

S2 dataset



In [ ]:

    
# segmentation_model = load_model('/notebook/models/full_model', custom_objects=model.__dict__)
# nn_tabula_pf2_eval_result = calc_metrics_per_dir('/notebook/pdffigures2/evaluation/datasets/s2/page_images_color/',
#                                                  make_pdffigures2_gold_rects_getter('/notebook/pdffigures2/evaluation/datasets/s2/annotations.json'),
#                                                  make_nn_tabula_rect_getter(segmentation_model,
#                                                                             '/notebook/pdffigures2/evaluation/s2/conference/pdfs/',
#                                                                             page_no_one_based=True),
#                                                  extension='jpg')
# nn_tabula_pf2_eval_result.transpose()[['mean', 'std']]

Their dataset, their pipeline



In [ ]:

			mean	std
macro	body	gold_n	1.141149	0.114382
		found_n	1.401379	0.223328
		f1	0.903669	0.056467
		prec	0.880548	0.069092
		rec	0.966590	0.024446
		fp	0.119452	0.069092
		tp	0.880548	0.069092
		fn	0.023448	0.021617
		full_dice	0.812553	0.075504
		tp_dice	0.903935	0.033206
	surrounding_text	gold_n	1.993793	0.175679
		found_n	2.220460	0.163783
		f1	0.883068	0.067647
		prec	0.869180	0.076181
		rec	0.927671	0.057664
		fp	0.130820	0.076181
		tp	0.869180	0.076181
		fn	0.070284	0.060733
		full_dice	0.669518	0.096178
		tp_dice	0.736466	0.069719
micro	body	gold_n	33.820000	3.742593
		found_n	41.480000	6.674354
		f1	0.862285	0.081962
		prec	0.796526	0.117163
		rec	0.949871	0.054951
		fp	0.203474	0.117163
		tp	0.796526	0.117163
		fn	0.042554	0.044005
		full_dice	0.732024	0.122049
		tp_dice	0.915295	0.025316
	surrounding_text	gold_n	58.260000	5.848333
		found_n	65.020000	5.477865
		f1	0.869754	0.069923
		prec	0.833570	0.096253
		rec	0.914769	0.056330
		fp	0.166430	0.096253
		tp	0.833570	0.096253
		fn	0.078811	0.053997
		full_dice	0.632733	0.110706
		tp_dice	0.754627	0.052377

			pure_nn		nn_tabula		pdffigures2		pure_tabula
			mean	std	mean	std	mean	std	mean	std
macro	body	gold_n	1.141149	0.114382	1.151319	0.107257	1.133333	0.113039	1.147816	0.101968
		found_n	1.401379	0.223328	1.087912	0.080378	1.093333	0.064118	1.164138	0.149322
		f1	0.903669	0.056467	0.937015	0.040542	0.932111	0.040186	0.612304	0.038001
		prec	0.880548	0.069092	0.945661	0.037127	0.943333	0.046547	0.589272	0.038479
		rec	0.966590	0.024446	0.934056	0.038731	0.926222	0.037921	0.681245	0.052947
		fp	0.119452	0.069092	0.025979	0.015417	0.056667	0.046547	0.256015	0.078403
		tp	0.880548	0.069092	0.945661	0.037127	0.943333	0.046547	0.589272	0.038479
		fn	0.023448	0.021617	0.066876	0.047938	0.064444	0.036977	0.322567	0.064521
		full_dice	0.812553	0.075504	0.846998	0.035813	0.808275	0.064085	0.382755	0.045143
		tp_dice	0.903935	0.033206	0.851393	0.030730	0.811512	0.069472	0.464457	0.055415
	surrounding_text	gold_n	1.993793	0.175679	1.994347	0.149263	1.980000	0.170945	2.005747	0.154252
		found_n	2.220460	0.163783	0.730411	0.106578	1.093333	0.064118	0.000000	0.000000
		f1	0.883068	0.067647	0.483515	0.067686	0.759039	0.033478	0.000000	0.000000
		prec	0.869180	0.076181	0.571302	0.076951	0.906667	0.043461	0.000000	0.000000
		rec	0.927671	0.057664	0.425158	0.062340	0.661135	0.031272	0.000000	0.000000
		fp	0.130820	0.076181	0.060484	0.033095	0.093333	0.043461	0.000000	0.000000
		tp	0.869180	0.076181	0.571302	0.076951	0.906667	0.043461	0.000000	0.000000
		fn	0.070284	0.060733	0.641930	0.064289	0.443889	0.054569	0.993103	0.015421
		full_dice	0.669518	0.096178	0.370886	0.052942	0.380587	0.035512	0.000000	0.000000
		tp_dice	0.736466	0.069719	0.372481	0.054502	0.380015	0.035703	0.000000	0.000000
micro	body	gold_n	33.820000	3.742593	32.300000	3.369718	34.280000	3.631391	34.120000	2.981946
		found_n	41.480000	6.674354	30.460000	2.117310	33.180000	3.016123	34.800000	4.403408
		f1	0.862285	0.081962	0.942167	0.038871	0.937248	0.040129	0.657371	0.059450
		prec	0.796526	0.117163	0.965468	0.027427	0.950276	0.044156	0.661827	0.086219
		rec	0.949871	0.054951	0.921209	0.051651	0.926139	0.050250	0.659942	0.051743
		fp	0.203474	0.117163	0.034532	0.027427	0.049724	0.044156	0.338173	0.086219
		tp	0.796526	0.117163	0.965468	0.027427	0.950276	0.044156	0.661827	0.086219
		fn	0.042554	0.044005	0.085168	0.058078	0.077997	0.055654	0.346269	0.077654
		full_dice	0.732024	0.122049	0.862897	0.033524	0.805343	0.065503	0.435130	0.070421
		tp_dice	0.915295	0.025316	0.894209	0.039020	0.846678	0.034291	0.653864	0.057612
	surrounding_text	gold_n	58.260000	5.848333	55.620000	4.053024	59.940000	6.010241	59.300000	4.669047
		found_n	65.020000	5.477865	20.120000	3.920077	33.180000	3.016123	0.000000	0.000000
		f1	0.869754	0.069923	0.703576	0.039193	0.759332	0.050709	0.000000	0.000000
		prec	0.833570	0.096253	0.906132	0.045132	0.914411	0.070916	0.000000	0.000000
		rec	0.914769	0.056330	0.575580	0.036015	0.649574	0.038925	0.000000	0.000000
		fp	0.166430	0.096253	0.093868	0.045132	0.085589	0.070916	0.000000	0.000000
		tp	0.833570	0.096253	0.906132	0.045132	0.914411	0.070916	0.000000	0.000000
		fn	0.078811	0.053997	0.668813	0.073337	0.491731	0.045517	1.000000	0.000000
		full_dice	0.632733	0.110706	0.591374	0.072365	0.380925	0.041536	0.000000	0.000000
		tp_dice	0.754627	0.052377	0.650999	0.059610	0.415237	0.020917	0.000000	0.000000