In [ ]:

How to use the models we trained


In [1]:
# !pip install -U opencv-python tabula-py pdfquery

In [1]:
%load_ext autoreload
%autoreload 2

import os, numpy, glob, collections, random, \
    shutil, pandas, time, subprocess, itertools, \
    cv2, tempfile, scipy, pdfquery, lxml.etree, json, traceback
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]= "1"

from PIL import Image
from IPython.display import display, SVG
from joblib import Parallel, delayed

import matplotlib.pyplot as plt
%pylab inline

import model
from model import *
from prepare_images_utils import *
from train_test_augment import augment_image_deterministic, WINDOW_SIZE
from make_inout_pairs import get_stroke, convert_svg

import tabula

import warnings
warnings.filterwarnings('ignore')

pandas.set_option('max_columns', 100)


Populating the interactive namespace from numpy and matplotlib
/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['random']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"
Using TensorFlow backend.

In [2]:
# segmentation_model = load_model('./models/contours.h5', custom_objects=dict(dice_coef_loss=dice_coef_loss))
segmentation_model = load_model('./models/joint1.h5', custom_objects=model.__dict__)

In [11]:
# Network outputs:
# (Footer, Header, Body) - hbf
# (Text - Header+Footer, Body) - bt

DENSITY=100
PIXELS_TO_POINTS_FACTOR = 72.0 / DENSITY
POINTS_TO_PIXELS_FACTOR = DENSITY / 72.0

def pdf_to_pages(in_file, out_dir, pages=None):
    if pages is None:
        subprocess.check_call(['convert',
                               '-define', 'pdf:use-cropbox=true',
                               '-density', str(DENSITY),
                               in_file,
                               '-sharpen', '0x1.0',
#                                '-resample', '{0}x{0}'.format(DENSITY),
                               os.path.join(out_dir, '%04d.png')])
    else:
        for page in pages:
            subprocess.check_call(['convert',
                                   '-define', 'pdf:use-cropbox=true',
                                   '-density', str(DENSITY),
                                   '{}[{}]'.format(in_file, page),
                                   '-sharpen', '0x1.0',
#                                    '-resample', '{0}x{0}'.format(DENSITY),
                                   os.path.join(out_dir, '{:04d}.png'.format(page))])
    result = list(glob.glob(os.path.join(out_dir, '*.png')))
    result.sort()
    return result


def make_demo_mask(page_image, boxes):
    demo_mask = numpy.zeros((page_image.size[1], page_image.size[0], 3),
                            dtype='uint8')
    for channel, box in boxes:
        y1, x1, y2, x2 = box
        color = [0] * 3
        color[channel] = 255
        cv2.drawContours(demo_mask,
                         [numpy.array([(x1, y1),
                                       (x2, y1),
                                       (x2, y2),
                                       (x1, y2)])],
                         -1,
                         tuple(color),
                         cv2.FILLED)

    demo_mask = arr_to_img(demo_mask.astype('float32') / 255.0)
    demo_mask_blended = Image.blend(page_image.convert('RGB'), demo_mask, 0.5)
    return demo_mask_blended


def process_page(segm_model, full_image_fname, out_dir, mask_threshold=0.5, min_contour_area=500, read_mode='L', read_proc=identity, take_channels=2):
    transformations = list(augment_image_deterministic(full_image_fname, out_dir,
                                                       scales=[(1, 1)],
                                                       rotations=[0]))
    transformed_images = numpy.expand_dims(read_images_to_tensor([f for _, _, _, _, _, f in transformations],
                                                                 mode=read_mode,
                                                                 proc=read_proc),
                                           -1)

    partial_masks = segm_model.predict(transformed_images)
    if isinstance(partial_masks, list):
        partial_masks = partial_masks[0]

    page_image = load_image_opaque(full_image_fname, mode='L')
#     print('page size', page_image.size)

    aggregated_mask = numpy.zeros((page_image.size[1], page_image.size[0], partial_masks.shape[-1]),
                                  dtype='float32')
    norm = numpy.zeros_like(aggregated_mask)
    for (scale_x, scale_y, x_off, y_off, angle, _), mask_pixels in zip(transformations, partial_masks):
        mask_width = min(WINDOW_SIZE[0], aggregated_mask.shape[1] - x_off)
        mask_height = min(WINDOW_SIZE[1], aggregated_mask.shape[0] - y_off)

        mask = arr_to_img(mask_pixels)
        mask = mask.rotate(-angle).resize((mask_width, mask_height), Image.BILINEAR)
        mask_pixels = numpy.array(mask)
        aggregated_mask[y_off:y_off+mask_pixels.shape[1],
                        x_off:x_off+mask_pixels.shape[0]] += mask_pixels

        norm[y_off:y_off+mask_pixels.shape[1],
             x_off:x_off+mask_pixels.shape[0], :] += 1
    aggregated_mask = numpy.nan_to_num(aggregated_mask / norm) / 255
    nan_mask = numpy.where(aggregated_mask.sum(-1) < 1e-3)
    aggregated_mask[nan_mask[0], nan_mask[1], -1] = 1

#     display(arr_to_img(aggregated_mask))

    # renormalize with softmax
    aggregated_mask = numpy.exp(aggregated_mask)
    aggregated_mask /= numpy.expand_dims(aggregated_mask.sum(-1), -1)
    
#     display(arr_to_img(aggregated_mask))
    
    # binarize by argmax
    max_idx = aggregated_mask.argmax(axis=-1)
    aggregated_mask_bin = numpy.zeros_like(aggregated_mask)
    for channel in range(aggregated_mask.shape[-1]):
        mask = numpy.where(max_idx == channel)
        aggregated_mask_bin[mask[0], mask[1], channel] = 1
#     display(arr_to_img(aggregated_mask_bin))

#     aggregated_mask = binarize_tensor(aggregated_mask, mask_threshold * 255.0)

    contours = [(channel,
                 cv2.findContours((aggregated_mask_bin[:, :, channel]).astype('uint8'),
                                  cv2.RETR_LIST,
                                  cv2.CHAIN_APPROX_SIMPLE)[1])
                for channel in range(take_channels)]

    result = []
    boxes_for_demo_mask = []
    for channel, cur_contours in contours:
        for cnt in cur_contours:
            if cv2.contourArea(cnt) < min_contour_area:
                continue
            x, y, w, h = cv2.boundingRect(cnt)
            result.append((channel, numpy.array([y, x, y+h, x+w]) * PIXELS_TO_POINTS_FACTOR))
            boxes_for_demo_mask.append((channel, numpy.array([y, x, y+h, x+w])))

#     print(result)
    demo_mask_blended = make_demo_mask(page_image, boxes_for_demo_mask)
#     display(demo_mask_blended)

    return result, demo_mask_blended


def extract_table_with_tabula(pdf_file, page_no, box):
    return tabula.read_pdf(pdf_file, guess=False, pages=page_no, area=tuple(box))


# CORNDER_COORDS_IDX = [(0, 1), (0, 3), (2, 1), (2, 3)]
# def box_to_corner_coords(box):
#     return [(box[i], box[j]) for i, j in CORNDER_COORDS_IDX]


# def two_box_distance(a, b):
#     point_pairs = itertools.product(box_to_corner_coords(a),
#                                     box_to_corner_coords(b))
#     distances = map(scipy.spatial.distance.euclidean, point_pairs)
#     return min(distances)


def rect_distance(a, b):
    y1, x1, y1b, x1b = a
    y2, x2, y2b, x2b = b
    left = x2b < x1
    right = x1b < x2
    bottom = y2b < y1
    top = y1b < y2
    if top and left:
        return scipy.spatial.distance.euclidean((x1, y1b), (x2b, y2))
    elif left and bottom:
        return scipy.spatial.distance.euclidean((x1, y1), (x2b, y2b))
    elif bottom and right:
        return scipy.spatial.distance.euclidean((x1b, y1), (x2, y2b))
    elif right and top:
        return scipy.spatial.distance.euclidean((x1b, y1b), (x2, y2))
    elif left:
        return x1 - x2b
    elif right:
        return x2 - x1b
    elif bottom:
        return y1 - y2b
    elif top:
        return y2 - y1b
    else:             # rectangles intersect
        return 0.


def convert_coords_to_pq(box, cropbox):
    ul_y, ul_x, br_y, br_x = box
    x_off, _, _, page_height = cropbox
    return numpy.array([ul_x + x_off, page_height - br_y, br_x + x_off, page_height - ul_y])


def convert_coords_from_pq(box, cropbox):
    bl_x, bl_y, ur_x, ur_y = box
    x_off, _, _, page_height = cropbox
    return numpy.array([page_height - ur_y, bl_x - x_off, page_height - bl_y, ur_x - x_off])


TableInfo = collections.namedtuple('TableInfo',
                                   'page surrounding_text_boxes surrounding_texts body_box body success'.split(' '))


def process_pdf(segm_model, in_file, tmp_dir_prefix='/tmp', return_only_successful=True, pad=2, pages=None, max_text_distance=20, min_table_cols=2, min_table_rows=2):
    result = []
    parsed_pdf = pdfquery.PDFQuery(in_file, parse_tree_cacher=pdfquery.cache.FileCache("/tmp/"), laparams=None)
    parsed_pdf.load()
    pdf_basename = os.path.splitext(os.path.basename(in_file))[0]
    with tempfile.TemporaryDirectory(dir=tmp_dir_prefix) as wd:
        page_filenames = pdf_to_pages(in_file, wd, pages=pages)

        for page_fname in page_filenames:
            page_i = int(os.path.splitext(os.path.basename(page_fname))[0])
            page = parsed_pdf.get_page(page_i)
#             print('id', page_i, page.pageid)
#             page_crop = page.cropbox
#             offsets = numpy.array([-page_crop[1] - pad, -page_crop[0] - pad, -page_crop[1] + pad, -page_crop[0] + pad])
            offsets = numpy.array([-pad, -pad, pad, pad])

            boxes, mask_blended = process_page(segm_model, page_fname, wd, mask_threshold=0.5)

            mask_blended.save('/notebook/papers/nn/pages/{}_{:04d}.png'.format(pdf_basename, page_i))

            this_page_result = []

            bodies = [box + offsets for channel, box in boxes if channel == 1]
            for body_box in bodies:
                body = extract_table_with_tabula(in_file, page_i + 1, body_box)
                table_info = TableInfo(page_i,
                                       [],
                                       [],
                                       body_box,
                                       body,
                                       (not body is None) and body.shape[0] >= min_table_rows and body.shape[1] >= min_table_cols)
                if table_info.success or not return_only_successful:
                    this_page_result.append(table_info)

#                 if table_info.success:
#                     display(body)

            if len(this_page_result) > 0:
                surrounding_texts = [box + offsets for channel, box in boxes if channel == 0]
                for text_box in surrounding_texts:
#                     print('src box', text_box)
#                     print('src conv', convert_coords_to_pq(text_box, page.cropbox))
                    text_query = 'LTPage[pageid="{}"] :in_bbox("{},{},{},{}")'.format(page_i + 1,
                                                                                      *convert_coords_to_pq(text_box,
                                                                                                            page.cropbox))
#                     print(text_query)
                    query_res = parsed_pdf.pq(text_query)
                    text = query_res.text()
                    if not text:
                        continue
#                     print('found box', query_res[0].layout.bbox)
#                     print('conv back', convert_coords_from_pq(query_res[0].layout.bbox, page.cropbox))
#                     print('cropbox', page.cropbox)
#                     print(text)
                    distances_to_tables = [rect_distance(text_box, table.body_box)
                                           for table in this_page_result]
                    closest = numpy.argmin(distances_to_tables)
                    if distances_to_tables[closest] < max_text_distance:
                        table = this_page_result[closest]
                        table.surrounding_text_boxes.append(text_box)
                        table.surrounding_texts.append(text)

            result.extend(this_page_result)
    return result


def pdffigures_format_rect_from_dict(rect_info):
    return numpy.array([rect_info['y1'], rect_info['x1'], rect_info['y2'], rect_info['x2']])


def pf2_detect_tables(in_file, tmp_dir_prefix='/tmp'):
    with tempfile.TemporaryDirectory(dir=tmp_dir_prefix) as wd:
        subprocess.check_call(['sbt',
                               "run-main org.allenai.pdffigures2.FigureExtractorBatchCli {0} -e -m {1} -d {1}".format(in_file, wd)],
                              cwd='/notebook/pdffigures2')

        result_by_page = collections.defaultdict(list)
        for res_file in glob.glob(os.path.join(wd, '*.json')):
            with open(res_file, 'r') as f:
                parse_res = json.load(f)
            for table_info in parse_res:
                if table_info['figType'] != 'Table':
                    continue
                page_no = table_info['page']
                result_by_page[page_no].append((0, pdffigures_format_rect_from_dict(table_info['captionBoundary'])))
                result_by_page[page_no].append((1, pdffigures_format_rect_from_dict(table_info['regionBoundary'])))
        return result_by_page


def process_pdf_pf2(segm_model, in_file, tmp_dir_prefix='/tmp', return_only_successful=True, pad=2, pages=None, max_text_distance=20, min_table_cols=2, min_table_rows=2):
    result = []
    parsed_pdf = pdfquery.PDFQuery(in_file, parse_tree_cacher=pdfquery.cache.FileCache("/tmp/"), laparams=None)
    parsed_pdf.load()
    pdf_basename = os.path.splitext(os.path.basename(in_file))[0]
    with tempfile.TemporaryDirectory(dir=tmp_dir_prefix) as wd:
        page_filenames = pdf_to_pages(in_file, wd, pages=pages)
        boxes_by_pages = pf2_detect_tables(in_file, tmp_dir_prefix=wd)

        for page_fname in page_filenames:
            page_i = int(os.path.splitext(os.path.basename(page_fname))[0])
            page = parsed_pdf.get_page(page_i)
            offsets = numpy.array([-pad, -pad, pad, pad])

            boxes = boxes_by_pages.get(page_i, [])
            boxes_for_display = [(ch, box * POINTS_TO_PIXELS_FACTOR) for ch, box in boxes]
            page_image = load_image_opaque(page_fname, mode='L')
            mask_blended = make_demo_mask(page_image, boxes_for_display)
            mask_blended.save('/notebook/papers/pf/pages/{}_{:04d}.png'.format(pdf_basename, page_i))

            this_page_result = []

            bodies = [box + offsets for channel, box in boxes if channel == 1]
            for body_box in bodies:
                body = extract_table_with_tabula(in_file, page_i + 1, body_box)
                table_info = TableInfo(page_i,
                                       [],
                                       [],
                                       body_box,
                                       body,
                                       (not body is None) and body.shape[0] >= min_table_rows and body.shape[1] >= min_table_cols)
                if table_info.success or not return_only_successful:
                    this_page_result.append(table_info)

#                 if table_info.success:
#                     display(body)

            if len(this_page_result) > 0:
                surrounding_texts = [box + offsets for channel, box in boxes if channel == 0]
                for text_box in surrounding_texts:
#                     print('src box', text_box)
#                     print('src conv', convert_coords_to_pq(text_box, page.cropbox))
                    text_query = 'LTPage[pageid="{}"] :in_bbox("{},{},{},{}")'.format(page_i + 1,
                                                                                      *convert_coords_to_pq(text_box,
                                                                                                            page.cropbox))
#                     print(text_query)
                    query_res = parsed_pdf.pq(text_query)
                    text = query_res.text()
                    if not text:
                        continue
#                     print('found box', query_res[0].layout.bbox)
#                     print('conv back', convert_coords_from_pq(query_res[0].layout.bbox, page.cropbox))
#                     print('cropbox', page.cropbox)
#                     print(text)
                    distances_to_tables = [rect_distance(text_box, table.body_box)
                                           for table in this_page_result]
                    closest = numpy.argmin(distances_to_tables)
                    if distances_to_tables[closest] < max_text_distance:
                        table = this_page_result[closest]
                        table.surrounding_text_boxes.append(text_box)
                        table.surrounding_texts.append(text)

            result.extend(this_page_result)
    return result


def table_info_to_dict(tinfo):
    result = dict(page=tinfo.page,
                  surrounding_text_boxes=[b.tolist() for b in tinfo.surrounding_text_boxes] or [],
                  surrounding_texts=tinfo.surrounding_texts,
                  body_box=tinfo.body_box.tolist(),
                  success=bool(int(tinfo.success or False)))
    if tinfo.success:
        result['body'] = json.loads(tinfo.body.to_json(orient='split'), encoding='utf8')
    return result


def process_pdfs(model, files, *args, **kwargs):
    result = {}
    for file in files:
        result[file] = process_pdf(model, file, *args, **kwargs)
    return result

In [4]:
# with open('./mwh_docs/dump_2015_10_23_done_no_norm.json', 'r') as f:
#     dc_markup = json.load(f)

# dc_docs = set(os.path.basename(d['fields']['source_file']) for d in dc_markup if 'source_file' in d['fields'] and d['fields']['subject'] == 1)

# for fname in dc_docs:
#     try:
#         shutil.copy2(os.path.join('./mwh_docs/src/', fname), './mwh_docs/dc/')
#     except:
#         print(fname)

Vis


In [5]:
# %%prun -s cumulative
# process_pdfs(segmentation_model, glob.glob('/notebook/data/tmp/11955277.pdf'))

Vis2


In [5]:
# Image.fromarray(numpy.squeeze(visualize_activation(segmentation_model, 31)))

In [6]:
# for i, layer in enumerate(segmentation_model.layers):
#     print(i, layer)

Apply to PDFs from ./papers

Our model as detector


In [7]:
for fname in glob.glob('/notebook/papers/*.pdf'):
    try:
        print('Processing {}'.format(fname))
        doc_res = process_pdf(segmentation_model, fname, return_only_successful=True)
        serial_res = [table_info_to_dict(tinfo) for tinfo in doc_res]

        doc_basename = os.path.splitext(os.path.basename(fname))[0]
        with open('/notebook/papers/nn/tables/{}.json'.format(doc_basename), 'w') as f:
            json.dump(serial_res, f, indent=True)

        print('Successfully processed {}'.format(fname))
    except KeyboardInterrupt:
        break
    except:
        print('Could not process {}'.format(fname))
        print(traceback.format_exc())
        print()


Processing /notebook/papers/2007-8.pdf
Successfully processed /notebook/papers/2007-8.pdf
Processing /notebook/papers/2003-13.pdf
Successfully processed /notebook/papers/2003-13.pdf
Processing /notebook/papers/2007-3.pdf
Successfully processed /notebook/papers/2007-3.pdf
Processing /notebook/papers/2009-2.pdf
Successfully processed /notebook/papers/2009-2.pdf
Processing /notebook/papers/2012-4.pdf
Successfully processed /notebook/papers/2012-4.pdf
Processing /notebook/papers/2003-1.pdf
Successfully processed /notebook/papers/2003-1.pdf
Processing /notebook/papers/2003-3.pdf
Successfully processed /notebook/papers/2003-3.pdf
Processing /notebook/papers/2003-18.pdf
Successfully processed /notebook/papers/2003-18.pdf
Processing /notebook/papers/2003-12.pdf
Could not process /notebook/papers/2003-12.pdf
Traceback (most recent call last):
  File "<ipython-input-7-036fc9801456>", line 4, in <module>
    doc_res = process_pdf(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-6-e3106961fe37>", line 191, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str


Processing /notebook/papers/2014-1.pdf
Could not process /notebook/papers/2014-1.pdf
Traceback (most recent call last):
  File "<ipython-input-7-036fc9801456>", line 4, in <module>
    doc_res = process_pdf(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-6-e3106961fe37>", line 191, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str


Processing /notebook/papers/2001-10-dendritic-cell-based-xenoantigen-vaccination-for-prostate-cancer-immunotherapy.pdf
Successfully processed /notebook/papers/2001-10-dendritic-cell-based-xenoantigen-vaccination-for-prostate-cancer-immunotherapy.pdf
Processing /notebook/papers/2001-15-in-vivo-transfection-and-or-cross-priming-of-dendritic-cells-following-dna-and-adenoviral-immunizations-for-immunotherapy-of-cancer.pdf
Successfully processed /notebook/papers/2001-15-in-vivo-transfection-and-or-cross-priming-of-dendritic-cells-following-dna-and-adenoviral-immunizations-for-immunotherapy-of-cancer.pdf
Processing /notebook/papers/2012-3.pdf
Successfully processed /notebook/papers/2012-3.pdf
Processing /notebook/papers/2002-8-treatment-of-non-resectable-hepatocellular-carcinoma-with-autologous-tumor-pulsed-dendritic-cells.pdf
Successfully processed /notebook/papers/2002-8-treatment-of-non-resectable-hepatocellular-carcinoma-with-autologous-tumor-pulsed-dendritic-cells.pdf
Processing /notebook/papers/2007-11.pdf
Successfully processed /notebook/papers/2007-11.pdf
Processing /notebook/papers/2011-11.pdf
Successfully processed /notebook/papers/2011-11.pdf
Processing /notebook/papers/2008-6.pdf
Successfully processed /notebook/papers/2008-6.pdf
Processing /notebook/papers/2014-3.pdf
Successfully processed /notebook/papers/2014-3.pdf
Processing /notebook/papers/2000-18-treatment-of-solid-tumours-in-children-with-tumour-lysate-pulsed-dendritic-cells.pdf
Successfully processed /notebook/papers/2000-18-treatment-of-solid-tumours-in-children-with-tumour-lysate-pulsed-dendritic-cells.pdf
Processing /notebook/papers/2003-11.pdf
Successfully processed /notebook/papers/2003-11.pdf
Processing /notebook/papers/2009-4.pdf
Successfully processed /notebook/papers/2009-4.pdf
Processing /notebook/papers/2001-12-immune-and-clinical-responses-in-patients-with-metastatic-melanoma-to-cd34-progenitor-derived-dendritic-cell-vaccine.pdf
Successfully processed /notebook/papers/2001-12-immune-and-clinical-responses-in-patients-with-metastatic-melanoma-to-cd34-progenitor-derived-dendritic-cell-vaccine.pdf
Processing /notebook/papers/2011-3.pdf
Successfully processed /notebook/papers/2011-3.pdf
Processing /notebook/papers/2001-16-regression-of-lymph-node-metastases-by-immunotherapy-using-autologous-breast-tumor-lysate-pulsed-dendritic-cells-report-of-a-case.pdf
Successfully processed /notebook/papers/2001-16-regression-of-lymph-node-metastases-by-immunotherapy-using-autologous-breast-tumor-lysate-pulsed-dendritic-cells-report-of-a-case.pdf
Processing /notebook/papers/2004-1.pdf
Successfully processed /notebook/papers/2004-1.pdf
Processing /notebook/papers/2010-1.pdf
Successfully processed /notebook/papers/2010-1.pdf
Processing /notebook/papers/2007-2.pdf
Successfully processed /notebook/papers/2007-2.pdf
Processing /notebook/papers/2000-11-immunotherapy-of-hormone-refractory-prostate-cancer-with-antigen-loaded-dendritic-cells.pdf
Successfully processed /notebook/papers/2000-11-immunotherapy-of-hormone-refractory-prostate-cancer-with-antigen-loaded-dendritic-cells.pdf
Processing /notebook/papers/2003-16.pdf
Successfully processed /notebook/papers/2003-16.pdf
Processing /notebook/papers/2011-4.pdf
Successfully processed /notebook/papers/2011-4.pdf
Processing /notebook/papers/2002-1-a-phase-i-trial-of-tumor-lysate-pulsed-dendritic-cells-in-the-treatment-of-advanced-cancer.pdf
Successfully processed /notebook/papers/2002-1-a-phase-i-trial-of-tumor-lysate-pulsed-dendritic-cells-in-the-treatment-of-advanced-cancer.pdf
Processing /notebook/papers/2003-5.pdf
Successfully processed /notebook/papers/2003-5.pdf
Processing /notebook/papers/2010-2.pdf
Successfully processed /notebook/papers/2010-2.pdf
Processing /notebook/papers/2003-8.pdf
Successfully processed /notebook/papers/2003-8.pdf
Processing /notebook/papers/2012-2.pdf
Successfully processed /notebook/papers/2012-2.pdf
Processing /notebook/papers/2009-6.pdf
Successfully processed /notebook/papers/2009-6.pdf
Processing /notebook/papers/2003-17.pdf
Successfully processed /notebook/papers/2003-17.pdf
Processing /notebook/papers/2001-14-immunotherapy-of-bladder-cancer-using-autologous-dendritic-cells-pulsed-with-human-lymphocyte-antigen-a24-specific-mage-3-peptide.pdf
Successfully processed /notebook/papers/2001-14-immunotherapy-of-bladder-cancer-using-autologous-dendritic-cells-pulsed-with-human-lymphocyte-antigen-a24-specific-mage-3-peptide.pdf
Processing /notebook/papers/2004-3.pdf
Successfully processed /notebook/papers/2004-3.pdf
Processing /notebook/papers/2001-13-immunotherapy-for-medullary-thyroid-carcinoma-by-dendritic-cell-vaccination.pdf
Successfully processed /notebook/papers/2001-13-immunotherapy-for-medullary-thyroid-carcinoma-by-dendritic-cell-vaccination.pdf
Processing /notebook/papers/2011-5.pdf
Successfully processed /notebook/papers/2011-5.pdf
Processing /notebook/papers/2011-8.pdf
Successfully processed /notebook/papers/2011-8.pdf
Processing /notebook/papers/2003-15.pdf
Successfully processed /notebook/papers/2003-15.pdf
Processing /notebook/papers/2007-12.pdf
Successfully processed /notebook/papers/2007-12.pdf
Processing /notebook/papers/2011-10.pdf
Successfully processed /notebook/papers/2011-10.pdf
Processing /notebook/papers/2009-7.pdf
Successfully processed /notebook/papers/2009-7.pdf
Processing /notebook/papers/2002-2-dendritic-cell-immunotherapy-for-patients-with-metastatic-renal-cell-carcinoma-university-of-tokyo-experience.pdf
Successfully processed /notebook/papers/2002-2-dendritic-cell-immunotherapy-for-patients-with-metastatic-renal-cell-carcinoma-university-of-tokyo-experience.pdf
Processing /notebook/papers/2002-4-immunotherapy-of-solid-cancer-using-dendritic-cells-pulsed-with-the-hla-a24-restricted-peptide-of-carcinoembryonic-antigen.pdf
Successfully processed /notebook/papers/2002-4-immunotherapy-of-solid-cancer-using-dendritic-cells-pulsed-with-the-hla-a24-restricted-peptide-of-carcinoembryonic-antigen.pdf
Processing /notebook/papers/2007-5.pdf
Successfully processed /notebook/papers/2007-5.pdf
Processing /notebook/papers/2011-2.pdf
Successfully processed /notebook/papers/2011-2.pdf
Processing /notebook/papers/2003-7.pdf
Successfully processed /notebook/papers/2003-7.pdf
Processing /notebook/papers/2007-4.pdf
Successfully processed /notebook/papers/2007-4.pdf
Processing /notebook/papers/2002-7-the-feasibility-and-safety-of-immunotherapy-with-dendritic-cells-loaded-with-cea-mrna-following-neoadjuvant-chemoradiotherapy-and-resection-of-pancreatic-cancer.pdf
Successfully processed /notebook/papers/2002-7-the-feasibility-and-safety-of-immunotherapy-with-dendritic-cells-loaded-with-cea-mrna-following-neoadjuvant-chemoradiotherapy-and-resection-of-pancreatic-cancer.pdf
Processing /notebook/papers/2008-5.pdf
Could not process /notebook/papers/2008-5.pdf
Traceback (most recent call last):
  File "<ipython-input-7-036fc9801456>", line 4, in <module>
    doc_res = process_pdf(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-6-e3106961fe37>", line 194, in process_pdf
    page_filenames = pdf_to_pages(in_file, wd, pages=pages)
  File "<ipython-input-6-e3106961fe37>", line 17, in pdf_to_pages
    os.path.join(out_dir, '%04d.png')])
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/subprocess.py", line 291, in check_call
    raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['convert', '-define', 'pdf:use-cropbox=true', '-density', '100', '/notebook/papers/2008-5.pdf', '-sharpen', '0x1.0', '/tmp/tmpz6qyvqe6/%04d.png']' returned non-zero exit status 1.


Processing /notebook/papers/2011-12.pdf
Could not process /notebook/papers/2011-12.pdf
Traceback (most recent call last):
  File "<ipython-input-7-036fc9801456>", line 4, in <module>
    doc_res = process_pdf(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-6-e3106961fe37>", line 212, in process_pdf
    body = extract_table_with_tabula(in_file, page_i + 1, body_box)
  File "<ipython-input-6-e3106961fe37>", line 130, in extract_table_with_tabula
    return tabula.read_pdf(pdf_file, guess=False, pages=page_no, area=tuple(box))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/tabula/wrapper.py", line 97, in read_pdf
    return pd.read_csv(io.BytesIO(output), **pandas_options)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/io/parsers.py", line 655, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/io/parsers.py", line 411, in _read
    data = parser.read(nrows)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/io/parsers.py", line 1005, in read
    ret = self._engine.read(nrows)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/io/parsers.py", line 1748, in read
    data = self._reader.read(nrows)
  File "pandas/_libs/parsers.pyx", line 890, in pandas._libs.parsers.TextReader.read (pandas/_libs/parsers.c:10862)
  File "pandas/_libs/parsers.pyx", line 912, in pandas._libs.parsers.TextReader._read_low_memory (pandas/_libs/parsers.c:11138)
  File "pandas/_libs/parsers.pyx", line 966, in pandas._libs.parsers.TextReader._read_rows (pandas/_libs/parsers.c:11884)
  File "pandas/_libs/parsers.pyx", line 953, in pandas._libs.parsers.TextReader._tokenize_rows (pandas/_libs/parsers.c:11755)
  File "pandas/_libs/parsers.pyx", line 2184, in pandas._libs.parsers.raise_parser_error (pandas/_libs/parsers.c:28765)
pandas.errors.ParserError: Error tokenizing data. C error: Expected 2 fields in line 3, saw 3



Processing /notebook/papers/2003-9.pdf
Successfully processed /notebook/papers/2003-9.pdf
Processing /notebook/papers/2008-4.pdf
Successfully processed /notebook/papers/2008-4.pdf
Processing /notebook/papers/2004-2.pdf
Successfully processed /notebook/papers/2004-2.pdf
Processing /notebook/papers/2014-2.pdf
Successfully processed /notebook/papers/2014-2.pdf
Processing /notebook/papers/2009-5.pdf
Successfully processed /notebook/papers/2009-5.pdf
Processing /notebook/papers/2007-7.pdf
Could not process /notebook/papers/2007-7.pdf
Traceback (most recent call last):
  File "<ipython-input-7-036fc9801456>", line 4, in <module>
    doc_res = process_pdf(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-6-e3106961fe37>", line 191, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str


Processing /notebook/papers/2011-6.pdf
Successfully processed /notebook/papers/2011-6.pdf
Processing /notebook/papers/2002-5-mucin-gene-muc1-transfected-dendritic-cells-as-vaccine-results-of-a-phase-i-ii-clinical-trial.pdf
Successfully processed /notebook/papers/2002-5-mucin-gene-muc1-transfected-dendritic-cells-as-vaccine-results-of-a-phase-i-ii-clinical-trial.pdf
Processing /notebook/papers/2002-3-immunotherapy-of-metastatic-renal-cell-carcinoma-with-tumor-lysate-pulsed-autologous-dendritic-cells.pdf
Successfully processed /notebook/papers/2002-3-immunotherapy-of-metastatic-renal-cell-carcinoma-with-tumor-lysate-pulsed-autologous-dendritic-cells.pdf
Processing /notebook/papers/2013-1.pdf
Successfully processed /notebook/papers/2013-1.pdf
Processing /notebook/papers/2007-9.pdf
Successfully processed /notebook/papers/2007-9.pdf
Processing /notebook/papers/2011-9.pdf
Successfully processed /notebook/papers/2011-9.pdf
Processing /notebook/papers/2011-1.pdf
Successfully processed /notebook/papers/2011-1.pdf
Processing /notebook/papers/2002-6-optimizing-dendritic-cell-based-immunotherapy-in-multiple-myeloma.pdf
Successfully processed /notebook/papers/2002-6-optimizing-dendritic-cell-based-immunotherapy-in-multiple-myeloma.pdf
Processing /notebook/papers/2001-9-altered-peptide-ligand-vaccination-with-flt3-ligand-expanded-dendritic-cells-for-tumor-immunotherapy.pdf
Successfully processed /notebook/papers/2001-9-altered-peptide-ligand-vaccination-with-flt3-ligand-expanded-dendritic-cells-for-tumor-immunotherapy.pdf
Processing /notebook/papers/2007-6.pdf
Successfully processed /notebook/papers/2007-6.pdf
Processing /notebook/papers/2000-17-phase-i-study-in-melanoma-patients-of-a-vaccine-with-peptide-pulsed-dendritic-cells-generated-in-vitro-from-cd34-hematopoietic-progenitor-cells.pdf
Successfully processed /notebook/papers/2000-17-phase-i-study-in-melanoma-patients-of-a-vaccine-with-peptide-pulsed-dendritic-cells-generated-in-vitro-from-cd34-hematopoietic-progenitor-cells.pdf
Processing /notebook/papers/2012-1.pdf
Could not process /notebook/papers/2012-1.pdf
Traceback (most recent call last):
  File "<ipython-input-7-036fc9801456>", line 4, in <module>
    doc_res = process_pdf(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-6-e3106961fe37>", line 191, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str


Processing /notebook/papers/2008-1.pdf
Successfully processed /notebook/papers/2008-1.pdf
Processing /notebook/papers/2003-6.pdf
Successfully processed /notebook/papers/2003-6.pdf
Processing /notebook/papers/2008-3.pdf
Successfully processed /notebook/papers/2008-3.pdf
Processing /notebook/papers/2007-1-unlocked-by-wwwfreemypdfcom.pdf
Successfully processed /notebook/papers/2007-1-unlocked-by-wwwfreemypdfcom.pdf
Processing /notebook/papers/2003-2.pdf
Successfully processed /notebook/papers/2003-2.pdf

Pdffigures2 as detector


In [12]:
for fname in glob.glob('/notebook/papers/*.pdf'):
    try:
        print('Processing {}'.format(fname))
        doc_res = process_pdf_pf2(segmentation_model, fname, return_only_successful=True)
        serial_res = [table_info_to_dict(tinfo) for tinfo in doc_res]

        doc_basename = os.path.splitext(os.path.basename(fname))[0]
        with open('/notebook/papers/pf/tables/{}.json'.format(doc_basename), 'w') as f:
            json.dump(serial_res, f, indent=True)

        print('Successfully processed {}'.format(fname))
    except KeyboardInterrupt:
        break
    except:
        print('Could not process {}'.format(fname))
        print(traceback.format_exc())
        print()


Processing /notebook/papers/2007-8.pdf
Successfully processed /notebook/papers/2007-8.pdf
Processing /notebook/papers/2003-13.pdf
Successfully processed /notebook/papers/2003-13.pdf
Processing /notebook/papers/2007-3.pdf
Successfully processed /notebook/papers/2007-3.pdf
Processing /notebook/papers/2009-2.pdf
Successfully processed /notebook/papers/2009-2.pdf
Processing /notebook/papers/2012-4.pdf
Successfully processed /notebook/papers/2012-4.pdf
Processing /notebook/papers/2003-1.pdf
Successfully processed /notebook/papers/2003-1.pdf
Processing /notebook/papers/2003-3.pdf
Successfully processed /notebook/papers/2003-3.pdf
Processing /notebook/papers/2003-18.pdf
Successfully processed /notebook/papers/2003-18.pdf
Processing /notebook/papers/2003-12.pdf
Could not process /notebook/papers/2003-12.pdf
Traceback (most recent call last):
  File "<ipython-input-12-afd191cdf3c8>", line 4, in <module>
    doc_res = process_pdf_pf2(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-11-a636097a9443>", line 280, in process_pdf_pf2
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str


Processing /notebook/papers/2014-1.pdf
Could not process /notebook/papers/2014-1.pdf
Traceback (most recent call last):
  File "<ipython-input-12-afd191cdf3c8>", line 4, in <module>
    doc_res = process_pdf_pf2(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-11-a636097a9443>", line 280, in process_pdf_pf2
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str


Processing /notebook/papers/2001-10-dendritic-cell-based-xenoantigen-vaccination-for-prostate-cancer-immunotherapy.pdf
Successfully processed /notebook/papers/2001-10-dendritic-cell-based-xenoantigen-vaccination-for-prostate-cancer-immunotherapy.pdf
Processing /notebook/papers/2001-15-in-vivo-transfection-and-or-cross-priming-of-dendritic-cells-following-dna-and-adenoviral-immunizations-for-immunotherapy-of-cancer.pdf
Successfully processed /notebook/papers/2001-15-in-vivo-transfection-and-or-cross-priming-of-dendritic-cells-following-dna-and-adenoviral-immunizations-for-immunotherapy-of-cancer.pdf
Processing /notebook/papers/2012-3.pdf
Successfully processed /notebook/papers/2012-3.pdf
Processing /notebook/papers/2002-8-treatment-of-non-resectable-hepatocellular-carcinoma-with-autologous-tumor-pulsed-dendritic-cells.pdf
Successfully processed /notebook/papers/2002-8-treatment-of-non-resectable-hepatocellular-carcinoma-with-autologous-tumor-pulsed-dendritic-cells.pdf
Processing /notebook/papers/2007-11.pdf
Successfully processed /notebook/papers/2007-11.pdf
Processing /notebook/papers/2011-11.pdf
Successfully processed /notebook/papers/2011-11.pdf
Processing /notebook/papers/2008-6.pdf
Successfully processed /notebook/papers/2008-6.pdf
Processing /notebook/papers/2014-3.pdf
Successfully processed /notebook/papers/2014-3.pdf
Processing /notebook/papers/2000-18-treatment-of-solid-tumours-in-children-with-tumour-lysate-pulsed-dendritic-cells.pdf
Successfully processed /notebook/papers/2000-18-treatment-of-solid-tumours-in-children-with-tumour-lysate-pulsed-dendritic-cells.pdf
Processing /notebook/papers/2003-11.pdf
Successfully processed /notebook/papers/2003-11.pdf
Processing /notebook/papers/2009-4.pdf
Successfully processed /notebook/papers/2009-4.pdf
Processing /notebook/papers/2001-12-immune-and-clinical-responses-in-patients-with-metastatic-melanoma-to-cd34-progenitor-derived-dendritic-cell-vaccine.pdf
Successfully processed /notebook/papers/2001-12-immune-and-clinical-responses-in-patients-with-metastatic-melanoma-to-cd34-progenitor-derived-dendritic-cell-vaccine.pdf
Processing /notebook/papers/2011-3.pdf
Successfully processed /notebook/papers/2011-3.pdf
Processing /notebook/papers/2001-16-regression-of-lymph-node-metastases-by-immunotherapy-using-autologous-breast-tumor-lysate-pulsed-dendritic-cells-report-of-a-case.pdf
Successfully processed /notebook/papers/2001-16-regression-of-lymph-node-metastases-by-immunotherapy-using-autologous-breast-tumor-lysate-pulsed-dendritic-cells-report-of-a-case.pdf
Processing /notebook/papers/2004-1.pdf
Successfully processed /notebook/papers/2004-1.pdf
Processing /notebook/papers/2010-1.pdf
Successfully processed /notebook/papers/2010-1.pdf
Processing /notebook/papers/2007-2.pdf
Successfully processed /notebook/papers/2007-2.pdf
Processing /notebook/papers/2000-11-immunotherapy-of-hormone-refractory-prostate-cancer-with-antigen-loaded-dendritic-cells.pdf
Successfully processed /notebook/papers/2000-11-immunotherapy-of-hormone-refractory-prostate-cancer-with-antigen-loaded-dendritic-cells.pdf
Processing /notebook/papers/2003-16.pdf
Successfully processed /notebook/papers/2003-16.pdf
Processing /notebook/papers/2011-4.pdf
Successfully processed /notebook/papers/2011-4.pdf
Processing /notebook/papers/2002-1-a-phase-i-trial-of-tumor-lysate-pulsed-dendritic-cells-in-the-treatment-of-advanced-cancer.pdf
Successfully processed /notebook/papers/2002-1-a-phase-i-trial-of-tumor-lysate-pulsed-dendritic-cells-in-the-treatment-of-advanced-cancer.pdf
Processing /notebook/papers/2003-5.pdf
Successfully processed /notebook/papers/2003-5.pdf
Processing /notebook/papers/2010-2.pdf
Successfully processed /notebook/papers/2010-2.pdf
Processing /notebook/papers/2003-8.pdf
Successfully processed /notebook/papers/2003-8.pdf
Processing /notebook/papers/2012-2.pdf
Successfully processed /notebook/papers/2012-2.pdf
Processing /notebook/papers/2009-6.pdf
Successfully processed /notebook/papers/2009-6.pdf
Processing /notebook/papers/2003-17.pdf
Successfully processed /notebook/papers/2003-17.pdf
Processing /notebook/papers/2001-14-immunotherapy-of-bladder-cancer-using-autologous-dendritic-cells-pulsed-with-human-lymphocyte-antigen-a24-specific-mage-3-peptide.pdf
Successfully processed /notebook/papers/2001-14-immunotherapy-of-bladder-cancer-using-autologous-dendritic-cells-pulsed-with-human-lymphocyte-antigen-a24-specific-mage-3-peptide.pdf
Processing /notebook/papers/2004-3.pdf
Successfully processed /notebook/papers/2004-3.pdf
Processing /notebook/papers/2001-13-immunotherapy-for-medullary-thyroid-carcinoma-by-dendritic-cell-vaccination.pdf
Successfully processed /notebook/papers/2001-13-immunotherapy-for-medullary-thyroid-carcinoma-by-dendritic-cell-vaccination.pdf
Processing /notebook/papers/2011-5.pdf
Successfully processed /notebook/papers/2011-5.pdf
Processing /notebook/papers/2011-8.pdf
Successfully processed /notebook/papers/2011-8.pdf
Processing /notebook/papers/2003-15.pdf
Successfully processed /notebook/papers/2003-15.pdf
Processing /notebook/papers/2007-12.pdf
Successfully processed /notebook/papers/2007-12.pdf
Processing /notebook/papers/2011-10.pdf
Successfully processed /notebook/papers/2011-10.pdf
Processing /notebook/papers/2009-7.pdf
Successfully processed /notebook/papers/2009-7.pdf
Processing /notebook/papers/2002-2-dendritic-cell-immunotherapy-for-patients-with-metastatic-renal-cell-carcinoma-university-of-tokyo-experience.pdf
Successfully processed /notebook/papers/2002-2-dendritic-cell-immunotherapy-for-patients-with-metastatic-renal-cell-carcinoma-university-of-tokyo-experience.pdf
Processing /notebook/papers/2002-4-immunotherapy-of-solid-cancer-using-dendritic-cells-pulsed-with-the-hla-a24-restricted-peptide-of-carcinoembryonic-antigen.pdf
Successfully processed /notebook/papers/2002-4-immunotherapy-of-solid-cancer-using-dendritic-cells-pulsed-with-the-hla-a24-restricted-peptide-of-carcinoembryonic-antigen.pdf
Processing /notebook/papers/2007-5.pdf
Successfully processed /notebook/papers/2007-5.pdf
Processing /notebook/papers/2011-2.pdf
Successfully processed /notebook/papers/2011-2.pdf
Processing /notebook/papers/2003-7.pdf
Successfully processed /notebook/papers/2003-7.pdf
Processing /notebook/papers/2007-4.pdf
Successfully processed /notebook/papers/2007-4.pdf
Processing /notebook/papers/2002-7-the-feasibility-and-safety-of-immunotherapy-with-dendritic-cells-loaded-with-cea-mrna-following-neoadjuvant-chemoradiotherapy-and-resection-of-pancreatic-cancer.pdf
Successfully processed /notebook/papers/2002-7-the-feasibility-and-safety-of-immunotherapy-with-dendritic-cells-loaded-with-cea-mrna-following-neoadjuvant-chemoradiotherapy-and-resection-of-pancreatic-cancer.pdf
Processing /notebook/papers/2008-5.pdf
Could not process /notebook/papers/2008-5.pdf
Traceback (most recent call last):
  File "<ipython-input-12-afd191cdf3c8>", line 4, in <module>
    doc_res = process_pdf_pf2(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-11-a636097a9443>", line 283, in process_pdf_pf2
    page_filenames = pdf_to_pages(in_file, wd, pages=pages)
  File "<ipython-input-11-a636097a9443>", line 17, in pdf_to_pages
    os.path.join(out_dir, '%04d.png')])
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/subprocess.py", line 291, in check_call
    raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['convert', '-define', 'pdf:use-cropbox=true', '-density', '100', '/notebook/papers/2008-5.pdf', '-sharpen', '0x1.0', '/tmp/tmp6bqcowzh/%04d.png']' returned non-zero exit status 1.


Processing /notebook/papers/2011-12.pdf
Successfully processed /notebook/papers/2011-12.pdf
Processing /notebook/papers/2003-9.pdf
Successfully processed /notebook/papers/2003-9.pdf
Processing /notebook/papers/2008-4.pdf
Successfully processed /notebook/papers/2008-4.pdf
Processing /notebook/papers/2004-2.pdf
Successfully processed /notebook/papers/2004-2.pdf
Processing /notebook/papers/2014-2.pdf
Successfully processed /notebook/papers/2014-2.pdf
Processing /notebook/papers/2009-5.pdf
Successfully processed /notebook/papers/2009-5.pdf
Processing /notebook/papers/2007-7.pdf
Could not process /notebook/papers/2007-7.pdf
Traceback (most recent call last):
  File "<ipython-input-12-afd191cdf3c8>", line 4, in <module>
    doc_res = process_pdf_pf2(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-11-a636097a9443>", line 280, in process_pdf_pf2
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str


Processing /notebook/papers/2011-6.pdf
Successfully processed /notebook/papers/2011-6.pdf
Processing /notebook/papers/2002-5-mucin-gene-muc1-transfected-dendritic-cells-as-vaccine-results-of-a-phase-i-ii-clinical-trial.pdf
Successfully processed /notebook/papers/2002-5-mucin-gene-muc1-transfected-dendritic-cells-as-vaccine-results-of-a-phase-i-ii-clinical-trial.pdf
Processing /notebook/papers/2002-3-immunotherapy-of-metastatic-renal-cell-carcinoma-with-tumor-lysate-pulsed-autologous-dendritic-cells.pdf
Successfully processed /notebook/papers/2002-3-immunotherapy-of-metastatic-renal-cell-carcinoma-with-tumor-lysate-pulsed-autologous-dendritic-cells.pdf
Processing /notebook/papers/2013-1.pdf
Successfully processed /notebook/papers/2013-1.pdf
Processing /notebook/papers/2007-9.pdf
Successfully processed /notebook/papers/2007-9.pdf
Processing /notebook/papers/2011-9.pdf
Successfully processed /notebook/papers/2011-9.pdf
Processing /notebook/papers/2011-1.pdf
Successfully processed /notebook/papers/2011-1.pdf
Processing /notebook/papers/2002-6-optimizing-dendritic-cell-based-immunotherapy-in-multiple-myeloma.pdf
Successfully processed /notebook/papers/2002-6-optimizing-dendritic-cell-based-immunotherapy-in-multiple-myeloma.pdf
Processing /notebook/papers/2001-9-altered-peptide-ligand-vaccination-with-flt3-ligand-expanded-dendritic-cells-for-tumor-immunotherapy.pdf
Successfully processed /notebook/papers/2001-9-altered-peptide-ligand-vaccination-with-flt3-ligand-expanded-dendritic-cells-for-tumor-immunotherapy.pdf
Processing /notebook/papers/2007-6.pdf
Successfully processed /notebook/papers/2007-6.pdf
Processing /notebook/papers/2000-17-phase-i-study-in-melanoma-patients-of-a-vaccine-with-peptide-pulsed-dendritic-cells-generated-in-vitro-from-cd34-hematopoietic-progenitor-cells.pdf
Successfully processed /notebook/papers/2000-17-phase-i-study-in-melanoma-patients-of-a-vaccine-with-peptide-pulsed-dendritic-cells-generated-in-vitro-from-cd34-hematopoietic-progenitor-cells.pdf
Processing /notebook/papers/2012-1.pdf
Could not process /notebook/papers/2012-1.pdf
Traceback (most recent call last):
  File "<ipython-input-12-afd191cdf3c8>", line 4, in <module>
    doc_res = process_pdf_pf2(segmentation_model, fname, return_only_successful=True)
  File "<ipython-input-11-a636097a9443>", line 280, in process_pdf_pf2
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str


Processing /notebook/papers/2008-1.pdf
Successfully processed /notebook/papers/2008-1.pdf
Processing /notebook/papers/2003-6.pdf
Successfully processed /notebook/papers/2003-6.pdf
Processing /notebook/papers/2008-3.pdf
Successfully processed /notebook/papers/2008-3.pdf
Processing /notebook/papers/2007-1-unlocked-by-wwwfreemypdfcom.pdf
Successfully processed /notebook/papers/2007-1-unlocked-by-wwwfreemypdfcom.pdf
Processing /notebook/papers/2003-2.pdf
Successfully processed /notebook/papers/2003-2.pdf

Evaluate


In [12]:
def rect_color_to_class(color):
    color = color.lower()
    if color in ('ff0000', '00ff00'):
        return 'surrounding_text'
    else:
        return 'body'


def get_svg_rectangles(svg_file):
    rects_by_class = collections.defaultdict(list)
    with open(svg_file, 'r') as f:
        tree = lxml.etree.parse(f)
        for n in list(tree.getroot().getchildren()):
            if n.tag.endswith('rect'):
                stroke = get_stroke(n)
                x = float(n.get('x'))
                y = float(n.get('y'))
                w = float(n.get('width'))
                h = float(n.get('height'))
                rects_by_class[rect_color_to_class(stroke)].append(numpy.array([y, x, y+h, x+w]))
    return rects_by_class


def rect_area(rect):
    y1, x1, y2, x2 = rect
    return (y2 - y1) * (x2 - x1)


def rect_intersection(a, b):
    ay1, ax1, ay2, ax2 = a
    by1, bx1, by2, bx2 = b
    iy1 = max(ay1, by1)
    ix1 = max(ax1, bx1)
    iy2 = min(ay2, by2)
    ix2 = min(ax2, bx2)
    if ix2 - ix1 <= 0 or iy2 - iy1 <= 0:
        return None
    return numpy.array([iy1, ix1, iy2, ix2])


def rect_union(a, b):
    ay1, ax1, ay2, ax2 = a
    by1, bx1, by2, bx2 = b
    return numpy.array([min(ay1, by1),
                        min(ax1, bx1),
                        max(ay2, by2),
                        max(ax2, bx2)])


def rect_dice(a, b):
    intersect = rect_intersection(a, b)
    if not intersect is None:
        return rect_area(intersect) / rect_area(rect_union(a, b))
    else:
        return 0


def calc_page_stat(gold_rects, found_rects, min_tp_dice=0.1):
    fp = 0
    tp = 0
    sum_tp_dice = 0
    full_dice = 0
    found_gr = set()

    for fr in found_rects:
        best_dice = 0
        best_gr_i = None
        for i, gr in enumerate(gold_rects):
            dice = rect_dice(fr, gr)
            if dice > best_dice:
                best_dice = dice
                best_gr_i = i

        if best_gr_i is None or best_dice < min_tp_dice:
            fp += 1
        else:
            tp += 1
            sum_tp_dice += best_dice
            found_gr.add(best_gr_i)
        full_dice += best_dice

    return dict(found_n=len(found_rects),
                gold_n=len(gold_rects),
                fp=fp,
                tp=tp,
                fn=len(gold_rects) - len(found_gr),
                tp_dice=sum_tp_dice,
                full_dice=full_dice)


def eval_on_single_doc(file, gold_rect_getter, rect_getter, min_tp_dice=0.1):
    gold_rects = gold_rect_getter(file)
    dirty_rects = rect_getter(file)
    classes = set(gold_rects.keys()) | set(dirty_rects.keys())
    return pandas.concat([pandas.Series(calc_page_stat(gold_rects.get(cls, []),
                                                       dirty_rects.get(cls, []),
                                                       min_tp_dice=min_tp_dice))
                          for cls in classes],
                         keys=classes)


def aggregate_df_metrics(df):
    result = []
    categories = set(df.columns.get_level_values(0))
    for category in categories:
        cat_metrics = df.xs(category, axis=1, level=0)

        non_empty_docs = cat_metrics[['gold_n', 'found_n']].sum(axis=1) > 0
        cat_metrics = cat_metrics[non_empty_docs]

        cat_metrics.tp_dice /= cat_metrics.tp
        cat_metrics.full_dice /= cat_metrics.found_n
        cat_metrics.fp /= cat_metrics.found_n
        cat_metrics.tp /= cat_metrics.found_n
        cat_metrics.fn /= cat_metrics.gold_n

        cat_metrics['prec'] = cat_metrics.tp / (cat_metrics.tp + cat_metrics.fp)
        cat_metrics['rec'] = cat_metrics.tp / (cat_metrics.tp + cat_metrics.fn)
        cat_metrics['f1'] = 2 * cat_metrics.prec * cat_metrics.rec / (cat_metrics.prec + cat_metrics.rec)

        result.append(cat_metrics['gold_n found_n f1 prec rec fp tp fn full_dice tp_dice'.split(' ')])

    result = pandas.concat(result,
                           axis=1,
                           keys=categories)
    result.replace([numpy.inf, -numpy.inf], numpy.nan, inplace=True)
    result.fillna(0, inplace=True)
    return result


def bootstrap_micro(raw_metrics, n=10):
    subsampled_metrics = []
    for _ in range(n):
        subsample = raw_metrics.sample(frac=1, replace=True, axis=0)
        sub_micro = aggregate_df_metrics(subsample.sum(axis=0).to_frame().transpose())
        subsampled_metrics.append(sub_micro)
    return pandas.concat(subsampled_metrics, axis=0)


def calc_metrics_per_dir(in_dir, gold_rect_getter, rect_getter, extension='svg', min_tp_dice=0.1):
    metrics = []
    for f in list(glob.glob(os.path.join(in_dir, '*.{}'.format(extension)))):
        try:
            metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
        except BaseException as e:
            print('Could not eval on {} due to {}'.format(f, e))
            print(traceback.format_exc())

    metrics = pandas.DataFrame(metrics)
    macro = aggregate_df_metrics(metrics).describe().loc[['mean', 'std']]
    micro = bootstrap_micro(metrics).describe().loc[['mean', 'std']]

    result = pandas.concat([macro, micro],
                            axis=1,
                            keys=['macro', 'micro'])
#     result = result.reorder_levels([1, 2, 0], axis=1)
#     result.sort_index(level=0, axis=1, inplace=True)
    return result


def make_nn_rect_getter(model, **process_kwargs):
    def _impl(svg_file):
        with tempfile.TemporaryDirectory() as wd:
            in_fname, _ = convert_svg(svg_file, wd)
            boxes = process_page(model, in_fname, wd, **process_kwargs)
            return dict(body=[box for channel, box in boxes if channel == 1],
                        surrounding_text=[box for channel, box in boxes if channel == 0])
    return _impl


def get_pdf_file_and_page(pdfs_dir, svg_file, page_no_one_based=False):
    # take first and last elements of splitted filename
    doc_id, *_, page_no = os.path.splitext(os.path.basename(svg_file))[0].split('-')
    page_no = int(page_no) # expected to be zero-based
    if page_no_one_based:
        page_no -= 1
    return os.path.join(pdfs_dir, doc_id + '.pdf'), page_no


def make_pure_nn_rect_getter(model, **process_kwargs):
    def _impl(svg_file):
        with tempfile.TemporaryDirectory() as wd:
            in_fname, _ = convert_svg(svg_file, wd)
            boxes = process_page(model, in_fname, wd, **process_kwargs)
            return dict(body=[box for channel, box in boxes if channel == 1],
                        surrounding_text=[box for channel, box in boxes if channel == 0])
    return _impl


def make_nn_tabula_rect_getter(model, pdfs_dir, page_no_one_based=False, **process_kwargs):
    def _impl(svg_file):
        with tempfile.TemporaryDirectory() as wd:
            pdf_fname, page_no = get_pdf_file_and_page(pdfs_dir, svg_file, page_no_one_based=page_no_one_based)
            tinfos = process_pdf(model, pdf_fname, pages=[page_no])
            return dict(body=[tinfo.body_box for tinfo in tinfos],
                        surrounding_text=[box
                                          for tinfo in tinfos
                                          for box in tinfo.surrounding_text_boxes])
    return _impl


def tabula_is_bad_cell(cell_info):
    return cell_info['height'] == 0 or cell_info['width'] == 0


def get_tabula_table_rect(tabula_table_info):
    top = 1e100
    left = 1e100
    bottom = 0
    right = 0
    for row in tabula_table_info['data']:
        for cell in row:
            if tabula_is_bad_cell(cell):
                continue
            if cell['top'] < top:
                top = cell['top']
            if cell['left'] < left:
                left = cell['left']
            cell_bottom = cell['top'] + cell['height']
            if cell_bottom > bottom:
                bottom = cell_bottom
            cell_right = cell['left'] + cell['width']
            if cell_right > right:
                right = cell_right
    return (top, left, bottom, right)


def make_pure_tabula_rect_getter(pdfs_dir, **tabula_kwargs):
    def _impl(svg_file):
        pdf_fname, page_no = get_pdf_file_and_page(pdfs_dir, svg_file)
        tabula_res = tabula.read_pdf(pdf_fname, output_format='json', pages=page_no + 1)
        return dict(body=[get_tabula_table_rect(table) for table in tabula_res])
    return _impl


def make_pdffigures_rect_getter(pdffigures_res_dir, **tabula_kwargs):
    def _impl(svg_file):
        pdf_fname, page_no = get_pdf_file_and_page(pdffigures_res_dir, svg_file)
        json_fname = os.path.splitext(pdf_fname)[0] + '.json'
        with open(json_fname, 'r') as f:
            pdffigures_res = json.load(f)
        page_tables = [fig for fig in pdffigures_res
                       if fig['figType'] == 'Table'
                       and fig['page'] == page_no] # pdffigures2 uses 0-based page numbering
        return dict(body=[pdffigures_format_rect_from_dict(t['regionBoundary']) for t in page_tables],
                    surrounding_text=[pdffigures_format_rect_from_dict(t['captionBoundary']) for t in page_tables])
    return _impl

Evaluate pure model


In [5]:
pure_nn_eval_result = []
for fold in list(glob.glob('/notebook/data/6_eval/*')):
    fold_model = segmentation_model = load_model(os.path.join(fold, 'model'),
                                                 custom_objects=model.__dict__)
    pure_nn_eval_result.append(calc_metrics_per_dir(os.path.join(fold, 'test'),
                                                    get_svg_rectangles,
                                                    make_pure_nn_rect_getter(fold_model),
                                                    min_tp_dice=0.1))
pure_nn_eval_agg = pandas.DataFrame([r.loc['mean'] for r in pure_nn_eval_result]).describe()
pure_nn_eval_agg.transpose()[['mean', 'std']]


Out[5]:
mean std
macro body gold_n 1.141149 0.114382
found_n 1.401379 0.223328
f1 0.903669 0.056467
prec 0.880548 0.069092
rec 0.966590 0.024446
fp 0.119452 0.069092
tp 0.880548 0.069092
fn 0.023448 0.021617
full_dice 0.812553 0.075504
tp_dice 0.903935 0.033206
surrounding_text gold_n 1.993793 0.175679
found_n 2.220460 0.163783
f1 0.883068 0.067647
prec 0.869180 0.076181
rec 0.927671 0.057664
fp 0.130820 0.076181
tp 0.869180 0.076181
fn 0.070284 0.060733
full_dice 0.669518 0.096178
tp_dice 0.736466 0.069719
micro body gold_n 33.820000 3.742593
found_n 41.480000 6.674354
f1 0.862285 0.081962
prec 0.796526 0.117163
rec 0.949871 0.054951
fp 0.203474 0.117163
tp 0.796526 0.117163
fn 0.042554 0.044005
full_dice 0.732024 0.122049
tp_dice 0.915295 0.025316
surrounding_text gold_n 58.260000 5.848333
found_n 65.020000 5.477865
f1 0.869754 0.069923
prec 0.833570 0.096253
rec 0.914769 0.056330
fp 0.166430 0.096253
tp 0.833570 0.096253
fn 0.078811 0.053997
full_dice 0.632733 0.110706
tp_dice 0.754627 0.052377

Evaluate model+tabula


In [6]:
nn_tabula_eval_result = []
for fold in list(glob.glob('/notebook/data/6_eval/*')):
    fold_model = segmentation_model = load_model(os.path.join(fold, 'model'),
                                                 custom_objects=model.__dict__)
    nn_tabula_eval_result.append(calc_metrics_per_dir(os.path.join(fold, 'test'),
                                                      get_svg_rectangles,
                                                      make_nn_tabula_rect_getter(fold_model, '/notebook/data/0_source_pdfs/'),
                                                      min_tp_dice=0.1))
nn_tabula_eval_agg = pandas.DataFrame([r.loc['mean'] for r in nn_tabula_eval_result]).describe()
nn_tabula_eval_agg.transpose()[['mean', 'std']]


Could not eval on /notebook/data/6_eval/1/test/17197902-0003.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/1/test/18207305-0002.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/1/test/18207305-0001.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/1/test/17669561-0007.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/3/test/17274750-0002.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/3/test/18207305-0003.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/4/test/17274750-0001.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/4/test/17669561-0003.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/4/test/17669561-0004.svg due to can't concat bytes to str
Could not eval on /notebook/data/6_eval/0/test/18950932-0001.svg due to 
Out[6]:
mean std
macro body gold_n 1.151319 0.107257
found_n 1.087912 0.080378
f1 0.937015 0.040542
prec 0.945661 0.037127
rec 0.934056 0.038731
fp 0.025979 0.015417
tp 0.945661 0.037127
fn 0.066876 0.047938
full_dice 0.846998 0.035813
tp_dice 0.851393 0.030730
surrounding_text gold_n 1.994347 0.149263
found_n 0.730411 0.106578
f1 0.483515 0.067686
prec 0.571302 0.076951
rec 0.425158 0.062340
fp 0.060484 0.033095
tp 0.571302 0.076951
fn 0.641930 0.064289
full_dice 0.370886 0.052942
tp_dice 0.372481 0.054502
micro body gold_n 32.300000 3.369718
found_n 30.460000 2.117310
f1 0.942167 0.038871
prec 0.965468 0.027427
rec 0.921209 0.051651
fp 0.034532 0.027427
tp 0.965468 0.027427
fn 0.085168 0.058078
full_dice 0.862897 0.033524
tp_dice 0.894209 0.039020
surrounding_text gold_n 55.620000 4.053024
found_n 20.120000 3.920077
f1 0.703576 0.039193
prec 0.906132 0.045132
rec 0.575580 0.036015
fp 0.093868 0.045132
tp 0.906132 0.045132
fn 0.668813 0.073337
full_dice 0.591374 0.072365
tp_dice 0.650999 0.059610

Evaluate model+tabula - full train


In [21]:
nn_tabula_full_eval_result = []
for fold in list(glob.glob('/notebook/data/6_eval/*')):
    nn_tabula_full_eval_result.append(calc_metrics_per_dir(os.path.join(fold, 'test'),
                                      get_svg_rectangles,
                                      make_nn_tabula_rect_getter(segmentation_model, '/notebook/data/0_source_pdfs/'),
                                      min_tp_dice=0.1))
nn_tabula_full_eval_agg = pandas.DataFrame([r.loc['mean'] for r in nn_tabula_full_eval_result]).describe()
nn_tabula_full_eval_agg.transpose()[['mean', 'std']]


Could not eval on /notebook/data/6_eval/1/test/17197902-0003.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/1/test/19794983-0003.svg due to Error tokenizing data. C error: Expected 1 fields in line 6, saw 2

Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 192, in process_pdf
    body = extract_table_with_tabula(in_file, page_i + 1, body_box)
  File "<ipython-input-3-9f22be994290>", line 113, in extract_table_with_tabula
    return tabula.read_pdf(pdf_file, guess=False, pages=page_no, area=tuple(box))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/tabula/wrapper.py", line 97, in read_pdf
    return pd.read_csv(io.BytesIO(output), **pandas_options)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/io/parsers.py", line 655, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/io/parsers.py", line 411, in _read
    data = parser.read(nrows)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/io/parsers.py", line 1005, in read
    ret = self._engine.read(nrows)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pandas/io/parsers.py", line 1748, in read
    data = self._reader.read(nrows)
  File "pandas/_libs/parsers.pyx", line 890, in pandas._libs.parsers.TextReader.read (pandas/_libs/parsers.c:10862)
  File "pandas/_libs/parsers.pyx", line 912, in pandas._libs.parsers.TextReader._read_low_memory (pandas/_libs/parsers.c:11138)
  File "pandas/_libs/parsers.pyx", line 966, in pandas._libs.parsers.TextReader._read_rows (pandas/_libs/parsers.c:11884)
  File "pandas/_libs/parsers.pyx", line 953, in pandas._libs.parsers.TextReader._tokenize_rows (pandas/_libs/parsers.c:11755)
  File "pandas/_libs/parsers.pyx", line 2184, in pandas._libs.parsers.raise_parser_error (pandas/_libs/parsers.c:28765)
pandas.errors.ParserError: Error tokenizing data. C error: Expected 1 fields in line 6, saw 2


Could not eval on /notebook/data/6_eval/1/test/18207305-0002.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/1/test/18207305-0001.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/1/test/17669561-0007.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/3/test/17274750-0002.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/3/test/18207305-0003.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/4/test/17274750-0001.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/4/test/17669561-0003.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/4/test/17669561-0004.svg due to can't concat bytes to str
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 174, in process_pdf
    parsed_pdf.load()
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 385, in load
    self.tree = self.get_tree(*_flatten(page_numbers))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 490, in get_tree
    page.set('page_label', self.doc.get_page_number(n))
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 271, in get_page_number
    page_label = label_format['P']+page_label
TypeError: can't concat bytes to str

Could not eval on /notebook/data/6_eval/0/test/18950932-0001.svg due to 
Traceback (most recent call last):
  File "<ipython-input-12-10700eff9ca3>", line 144, in calc_metrics_per_dir
    metrics.append(eval_on_single_doc(f, gold_rect_getter, rect_getter, min_tp_dice=min_tp_dice))
  File "<ipython-input-12-10700eff9ca3>", line 93, in eval_on_single_doc
    dirty_rects = rect_getter(file)
  File "<ipython-input-12-10700eff9ca3>", line 194, in _impl
    tinfos = process_pdf(model, pdf_fname, pages=[page_no])
  File "<ipython-input-3-9f22be994290>", line 180, in process_pdf
    page = parsed_pdf.get_page(page_i)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 595, in get_page
    return self._cached_pages(target_page=page_number)
  File "/root/.pyenv/versions/3.6.0/lib/python3.6/site-packages/pdfquery/pdfquery.py", line 627, in _cached_pages
    next_page = next(self._pages_iter)
StopIteration

Out[21]:
mean std
macro body gold_n 1.160549 0.095025
found_n 1.052095 0.098655
f1 0.925284 0.060907
prec 0.935260 0.045418
rec 0.918657 0.070972
fp 0.013810 0.018928
tp 0.935260 0.045418
fn 0.089788 0.085035
full_dice 0.855598 0.062039
tp_dice 0.855598 0.062039
surrounding_text gold_n 2.008945 0.126738
found_n 0.675626 0.081495
f1 0.450365 0.057326
prec 0.528766 0.066409
rec 0.398208 0.051894
fp 0.055359 0.026994
tp 0.528766 0.066409
fn 0.667816 0.049357
full_dice 0.416132 0.055835
tp_dice 0.418778 0.052229
micro body gold_n 31.740000 3.444271
found_n 28.940000 2.725436
f1 0.950101 0.046783
prec 0.985612 0.024365
rec 0.920442 0.078647
fp 0.014388 0.024365
tp 0.985612 0.024365
fn 0.093861 0.101933
full_dice 0.900556 0.029966
tp_dice 0.913698 0.021993
surrounding_text gold_n 54.380000 4.580611
found_n 18.620000 3.425201
f1 0.695261 0.041947
prec 0.901733 0.053354
rec 0.566132 0.034993
fp 0.098267 0.053354
tp 0.901733 0.053354
fn 0.690549 0.056853
full_dice 0.715125 0.050853
tp_dice 0.792002 0.017137

Evaluate pure pdffigures2


In [15]:
pdffigures_eval_result = []
for fold in list(glob.glob('/notebook/data/6_eval/*')):
    pdffigures_eval_result.append(calc_metrics_per_dir(os.path.join(fold, 'test'),
                                                       get_svg_rectangles,
                                                       make_pdffigures_rect_getter('/notebook/data/1_pdffigures2_out/'),
                                                       min_tp_dice=0.1))
pdffigures_eval_agg = pandas.DataFrame([r.loc['mean'] for r in pdffigures_eval_result]).describe()
pdffigures_eval_agg.transpose()[['mean', 'std']]


Out[15]:
mean std
macro body gold_n 1.133333 0.113039
found_n 1.093333 0.064118
f1 0.932111 0.040186
prec 0.943333 0.046547
rec 0.926222 0.037921
fp 0.056667 0.046547
tp 0.943333 0.046547
fn 0.064444 0.036977
full_dice 0.808275 0.064085
tp_dice 0.811512 0.069472
surrounding_text gold_n 1.980000 0.170945
found_n 1.093333 0.064118
f1 0.759039 0.033478
prec 0.906667 0.043461
rec 0.661135 0.031272
fp 0.093333 0.043461
tp 0.906667 0.043461
fn 0.443889 0.054569
full_dice 0.380587 0.035512
tp_dice 0.380015 0.035703
micro body gold_n 34.280000 3.631391
found_n 33.180000 3.016123
f1 0.937248 0.040129
prec 0.950276 0.044156
rec 0.926139 0.050250
fp 0.049724 0.044156
tp 0.950276 0.044156
fn 0.077997 0.055654
full_dice 0.805343 0.065503
tp_dice 0.846678 0.034291
surrounding_text gold_n 59.940000 6.010241
found_n 33.180000 3.016123
f1 0.759332 0.050709
prec 0.914411 0.070916
rec 0.649574 0.038925
fp 0.085589 0.070916
tp 0.914411 0.070916
fn 0.491731 0.045517
full_dice 0.380925 0.041536
tp_dice 0.415237 0.020917

Evaluate pure tabula


In [13]:
pure_tabula_eval_result = []
for fold in list(glob.glob('/notebook/data/6_eval/*')):
    pure_tabula_eval_result.append(calc_metrics_per_dir(os.path.join(fold, 'test'),
                                                        get_svg_rectangles,
                                                        make_pure_tabula_rect_getter('/notebook/data/0_source_pdfs/'),
                                                        min_tp_dice=0.1))
pure_tabula_eval_agg = pandas.DataFrame([r.loc['mean'] for r in pure_tabula_eval_result]).describe()
pure_tabula_eval_agg.transpose()[['mean', 'std']]


Out[13]:
mean std
macro body gold_n 1.147816 0.101968
found_n 1.164138 0.149322
f1 0.612304 0.038001
prec 0.589272 0.038479
rec 0.681245 0.052947
fp 0.256015 0.078403
tp 0.589272 0.038479
fn 0.322567 0.064521
full_dice 0.382755 0.045143
tp_dice 0.464457 0.055415
surrounding_text gold_n 2.005747 0.154252
found_n 0.000000 0.000000
f1 0.000000 0.000000
prec 0.000000 0.000000
rec 0.000000 0.000000
fp 0.000000 0.000000
tp 0.000000 0.000000
fn 0.993103 0.015421
full_dice 0.000000 0.000000
tp_dice 0.000000 0.000000
micro body gold_n 34.120000 2.981946
found_n 34.800000 4.403408
f1 0.657371 0.059450
prec 0.661827 0.086219
rec 0.659942 0.051743
fp 0.338173 0.086219
tp 0.661827 0.086219
fn 0.346269 0.077654
full_dice 0.435130 0.070421
tp_dice 0.653864 0.057612
surrounding_text gold_n 59.300000 4.669047
found_n 0.000000 0.000000
f1 0.000000 0.000000
prec 0.000000 0.000000
rec 0.000000 0.000000
fp 0.000000 0.000000
tp 0.000000 0.000000
fn 1.000000 0.000000
full_dice 0.000000 0.000000
tp_dice 0.000000 0.000000

Summary


In [18]:
summary_our_dataset = pandas.concat([
                                     pure_nn_eval_agg.transpose()[['mean', 'std']],
                                     nn_tabula_eval_agg.transpose()[['mean', 'std']],
                                     pdffigures_eval_agg.transpose()[['mean', 'std']],
                                     pure_tabula_eval_agg.transpose()[['mean', 'std']],
                                     ],
                                    axis=1,
                                    keys=[
                                          'pure_nn',
                                          'nn_tabula',
                                          'pdffigures2',
                                          'pure_tabula'
                                          ])
summary_our_dataset


Out[18]:
pure_nn nn_tabula pdffigures2 pure_tabula
mean std mean std mean std mean std
macro body gold_n 1.141149 0.114382 1.151319 0.107257 1.133333 0.113039 1.147816 0.101968
found_n 1.401379 0.223328 1.087912 0.080378 1.093333 0.064118 1.164138 0.149322
f1 0.903669 0.056467 0.937015 0.040542 0.932111 0.040186 0.612304 0.038001
prec 0.880548 0.069092 0.945661 0.037127 0.943333 0.046547 0.589272 0.038479
rec 0.966590 0.024446 0.934056 0.038731 0.926222 0.037921 0.681245 0.052947
fp 0.119452 0.069092 0.025979 0.015417 0.056667 0.046547 0.256015 0.078403
tp 0.880548 0.069092 0.945661 0.037127 0.943333 0.046547 0.589272 0.038479
fn 0.023448 0.021617 0.066876 0.047938 0.064444 0.036977 0.322567 0.064521
full_dice 0.812553 0.075504 0.846998 0.035813 0.808275 0.064085 0.382755 0.045143
tp_dice 0.903935 0.033206 0.851393 0.030730 0.811512 0.069472 0.464457 0.055415
surrounding_text gold_n 1.993793 0.175679 1.994347 0.149263 1.980000 0.170945 2.005747 0.154252
found_n 2.220460 0.163783 0.730411 0.106578 1.093333 0.064118 0.000000 0.000000
f1 0.883068 0.067647 0.483515 0.067686 0.759039 0.033478 0.000000 0.000000
prec 0.869180 0.076181 0.571302 0.076951 0.906667 0.043461 0.000000 0.000000
rec 0.927671 0.057664 0.425158 0.062340 0.661135 0.031272 0.000000 0.000000
fp 0.130820 0.076181 0.060484 0.033095 0.093333 0.043461 0.000000 0.000000
tp 0.869180 0.076181 0.571302 0.076951 0.906667 0.043461 0.000000 0.000000
fn 0.070284 0.060733 0.641930 0.064289 0.443889 0.054569 0.993103 0.015421
full_dice 0.669518 0.096178 0.370886 0.052942 0.380587 0.035512 0.000000 0.000000
tp_dice 0.736466 0.069719 0.372481 0.054502 0.380015 0.035703 0.000000 0.000000
micro body gold_n 33.820000 3.742593 32.300000 3.369718 34.280000 3.631391 34.120000 2.981946
found_n 41.480000 6.674354 30.460000 2.117310 33.180000 3.016123 34.800000 4.403408
f1 0.862285 0.081962 0.942167 0.038871 0.937248 0.040129 0.657371 0.059450
prec 0.796526 0.117163 0.965468 0.027427 0.950276 0.044156 0.661827 0.086219
rec 0.949871 0.054951 0.921209 0.051651 0.926139 0.050250 0.659942 0.051743
fp 0.203474 0.117163 0.034532 0.027427 0.049724 0.044156 0.338173 0.086219
tp 0.796526 0.117163 0.965468 0.027427 0.950276 0.044156 0.661827 0.086219
fn 0.042554 0.044005 0.085168 0.058078 0.077997 0.055654 0.346269 0.077654
full_dice 0.732024 0.122049 0.862897 0.033524 0.805343 0.065503 0.435130 0.070421
tp_dice 0.915295 0.025316 0.894209 0.039020 0.846678 0.034291 0.653864 0.057612
surrounding_text gold_n 58.260000 5.848333 55.620000 4.053024 59.940000 6.010241 59.300000 4.669047
found_n 65.020000 5.477865 20.120000 3.920077 33.180000 3.016123 0.000000 0.000000
f1 0.869754 0.069923 0.703576 0.039193 0.759332 0.050709 0.000000 0.000000
prec 0.833570 0.096253 0.906132 0.045132 0.914411 0.070916 0.000000 0.000000
rec 0.914769 0.056330 0.575580 0.036015 0.649574 0.038925 0.000000 0.000000
fp 0.166430 0.096253 0.093868 0.045132 0.085589 0.070916 0.000000 0.000000
tp 0.833570 0.096253 0.906132 0.045132 0.914411 0.070916 0.000000 0.000000
fn 0.078811 0.053997 0.668813 0.073337 0.491731 0.045517 1.000000 0.000000
full_dice 0.632733 0.110706 0.591374 0.072365 0.380925 0.041536 0.000000 0.000000
tp_dice 0.754627 0.052377 0.650999 0.059610 0.415237 0.020917 0.000000 0.000000

Evaluate on pdffigures2 datasets


In [12]:
# !apt-get update && apt-get install -yqq poppler-utils
# !python /notebook/pdffigures2/evaluation/download_from_urls.py -g -c

Their dataset, our pipeline


In [9]:
def pdffigures_format_rect_from_list(lst):
    x1, y1, x2, y2 = lst
    return (y1, x1, y2, x2)

def make_pdffigures2_gold_rects_getter(annotations_file):
    with open(annotations_file, 'r') as f:
        annotations = json.load(f)

    def _impl(file):
        doc_id, *_, page_no = os.path.splitext(os.path.basename(file))[0].split('-')
        page_no = int(page_no)

        page_tables = [fig for fig in annotations[doc_id]['figures']
                       if fig['figure_type'] == 'Table'
                       and fig['page'] == page_no]
        return dict(body=[pdffigures_format_rect_from_list(fig['region_bb']) for fig in page_tables],
                    surrounding_text=[pdffigures_format_rect_from_list(t['caption_bb']) for t in page_tables])
    return _impl

CS-150 dataset


In [14]:
segmentation_model = load_model('/notebook/models/full_model', custom_objects=model.__dict__)
nn_tabula_pf2_eval_result = calc_metrics_per_dir('/notebook/pdffigures2/evaluation/datasets/conference/page_images_color_only_with_tables/',
                                                 make_pdffigures2_gold_rects_getter('/notebook/pdffigures2/evaluation/datasets/conference/annotations.json'),
                                                 make_nn_tabula_rect_getter(segmentation_model,
                                                                            '/notebook/pdffigures2/evaluation/datasets/conference/pdfs/',
                                                                            page_no_one_based=True),
                                                 min_tp_dice=0.2,
                                                 extension='jpg')
nn_tabula_pf2_eval_result.transpose()[['mean', 'std']]


Out[14]:
mean std
macro surrounding_text gold_n 0.666667 0.516398
found_n 0.833333 1.602082
f1 0.233333 0.408248
prec 0.208333 0.400520
rec 0.333333 0.516398
fp 0.125000 0.306186
tp 0.208333 0.400520
fn 0.333333 0.516398
full_dice 0.065484 0.102478
tp_dice 0.152247 0.279583
body gold_n 0.666667 0.516398
found_n 1.500000 0.836660
f1 0.444444 0.501848
prec 0.416667 0.491596
rec 0.500000 0.547723
fp 0.583333 0.491596
tp 0.416667 0.491596
fn 0.166667 0.408248
full_dice 0.412880 0.443159
tp_dice 0.464098 0.510009
micro surrounding_text gold_n 3.400000 2.118700
found_n 3.600000 2.503331
f1 0.553935 0.296158
prec 0.547500 0.348897
rec 0.623969 0.336987
fp 0.352500 0.316327
tp 0.547500 0.348897
fn 0.383333 0.330572
full_dice 0.175890 0.064312
tp_dice 0.349353 0.194061
body gold_n 3.400000 2.118700
found_n 7.300000 4.029061
f1 0.509852 0.219189
prec 0.401515 0.202438
rec 0.797689 0.297804
fp 0.598485 0.202438
tp 0.401515 0.202438
fn 0.157143 0.254328
full_dice 0.389217 0.180099
tp_dice 0.916280 0.034531

S2 dataset


In [ ]:
# segmentation_model = load_model('/notebook/models/full_model', custom_objects=model.__dict__)
# nn_tabula_pf2_eval_result = calc_metrics_per_dir('/notebook/pdffigures2/evaluation/datasets/s2/page_images_color/',
#                                                  make_pdffigures2_gold_rects_getter('/notebook/pdffigures2/evaluation/datasets/s2/annotations.json'),
#                                                  make_nn_tabula_rect_getter(segmentation_model,
#                                                                             '/notebook/pdffigures2/evaluation/s2/conference/pdfs/',
#                                                                             page_no_one_based=True),
#                                                  extension='jpg')
# nn_tabula_pf2_eval_result.transpose()[['mean', 'std']]

Their dataset, their pipeline


In [ ]: