This example uses pdfplumber's visual debugging and text-extraction features to parse a fixed-width table embedded in a PDF. Thanks to Ron Campbell for the sample PDF.
In [1]:
import pdfplumber
import re
from collections import OrderedDict
In [2]:
# Open the sample PDF. The path is relative, so it is resolved against the
# kernel's current working directory.
pdf = pdfplumber.open("../pdfs/san-jose-pd-firearm-sample.pdf")
In [3]:
# Work with the first page only (index 0 of `pdf.pages`).
p0 = pdf.pages[0]
In [4]:
# Render the page to an image object; the bare `im` on the last line makes it
# the cell's displayed output.
im = p0.to_image()
im
Out[4]:
Below, we draw rectangles around each of the char objects that pdfplumber detected. By doing so, we can see that every line of the main part of the report is the same width, and that there are space (" ") characters padding out each field. That means we can parse those lines a lot like we'd parse a standard fixed-width data file.
In [5]:
# Call reset() on the page image (presumably clearing earlier drawings --
# confirm against the pdfplumber docs), then outline every char object in
# `p0.chars` so the padding spaces become visible.
im.reset().draw_rects(p0.chars)
Out[5]:
In [6]:
# Pull the page's text out as one string and print it for inspection; the
# cells below search and split this string.
text = p0.extract_text()
print(text)
In [7]:
# Capture everything between the "LOCATION" header (plus the run of dashes /
# whitespace that follows it) and the line reading "Flags = e".
# re.DOTALL lets `.` match newlines so the capture can span many lines; `.*` is
# greedy, so the capture ends at the LAST newline + whitespace + "Flags = e".
core_pat = re.compile(r"LOCATION[\-\s]+(.*)\n\s+Flags = e", re.DOTALL)
In [8]:
# Search with the compiled pattern directly. A missing match would otherwise
# surface as an opaque "'NoneType' object has no attribute 'group'", so fail
# with a message that says what was expected in the page text.
core_match = core_pat.search(text)
if core_match is None:
    raise ValueError(
        "Could not find the table body between 'LOCATION' and 'Flags = e' "
        "in the extracted page text"
    )
core = core_match.group(1)
In [9]:
# Show the captured table body (the text matched by group 1 of `core_pat`).
print(core)
In [10]:
# Pair up consecutive lines: line 0 with line 1, line 2 with line 3, and so on.
# A trailing unpaired line, if there is one, is left out.
lines = core.split("\n")
line_groups = [(lines[i], lines[i + 1]) for i in range(0, len(lines) - 1, 2)]
print(line_groups[0])
In [11]:
# Fixed-width column layout of one record, as (field name, start, stop) slice
# bounds. A record spans two consecutive text lines; the tuples below list the
# fields read from the first and second line respectively, in output order.
FIRST_LINE_FIELDS = (
    ("type", 0, 20),
    ("item", 21, 41),
    ("make", 44, 89),
    ("model", 90, 105),
    ("calibre", 106, 111),
    ("status", 112, 120),
    ("flags", 124, 129),
)
SECOND_LINE_FIELDS = (
    ("serial_number", 0, 13),
    ("report_tag_number", 21, 41),
    ("case_file_number", 44, 64),
    ("storage_location", 68, 91),
)


def parse_row(first_line, second_line):
    """Parse one two-line fixed-width record into an ordered mapping.

    Each field is cut out of its line by character position and stripped of
    surrounding whitespace. Lines shorter than a field's start offset simply
    yield an empty string for that field (slicing never raises).

    Args:
        first_line: Text of the record's first line.
        second_line: Text of the record's second line.

    Returns:
        An OrderedDict with the first-line fields followed by the second-line
        fields, in layout order.
    """
    pairs = [
        (name, first_line[start:stop].strip())
        for name, start, stop in FIRST_LINE_FIELDS
    ]
    pairs.extend(
        (name, second_line[start:stop].strip())
        for name, start, stop in SECOND_LINE_FIELDS
    )
    return OrderedDict(pairs)
In [12]:
# Every entry of `line_groups` is a (first_line, second_line) pair, so it can be
# unpacked straight into parse_row's two positional parameters.
parsed = [parse_row(*line_pair) for line_pair in line_groups]
In [13]:
# Peek at the first two parsed records.
parsed[:2]
Out[13]:
In [14]:
import pandas as pd

# Take the column order from the first record's keys, then display the
# records as a table with the columns in that order.
columns = [key for key in parsed[0]]
pd.DataFrame(parsed)[columns]
Out[14]: