In [2]:
!pip show pdf-table-extract
---
Name: pdf-table-extract
Version: 0.1
Location: /home/blannon/epd/lib/python2.7/site-packages/pdf_table_extract-0.1-py2.7.egg
Requires:
In [1]:
!pwd
/home/blannon/src/pdf-table-extract
In [7]:
import sys
import json
import pandas as pd
import pdftableextract as pdf
In [2]:
# Re-execute the already-imported pdftableextract module so the notebook picks
# up the current state of its source files.
# NOTE(review): `reload` is a builtin only on Python 2 (importlib.reload on 3).
reload(pdf)
Out[2]:
<module 'pdftableextract' from '/home/blannon/src/pdf-table-extract/src/pdftableextract/__init__.pyc'>
In [3]:
# Pages to extract, given as strings; only page "1" is requested here.
pages = ["1"]
# Collect one pdf.process_page result per requested page, in `pages` order.
cells = []
for page in pages:
    cells.append(pdf.process_page("example/example.pdf", page))
In [4]:
# Display the raw extraction result: a list with one entry per page, each
# entry being a list of 6-item cell tuples.
cells
Out[4]:
[[(0,
0,
6,
1,
1,
'Unit Count and Square Footage First Quarter, Fiscal Year 2014 As of April 30, 2013'),
(0, 1, 1, 1, 1, 'Format'),
(1, 1, 1, 1, 1, 'New'),
(2, 1, 1, 1, 1, 'Closed'),
(3, 1, 1, 1, 1, 'Relocation/ Expansion/ * Conversion'),
(4, 1, 1, 1, 1, 'Ending Square Footage'),
(5, 1, 1, 1, 1, 'Total Locations'),
(0, 2, 1, 1, 1, 'Walmart Discount Stores'),
(1, 2, 1, 1, 1, '1'),
(2, 2, 1, 1, 1, ''),
(3, 2, 1, 1, 1, '(8)'),
(4, 2, 1, 1, 1, '58,277,925'),
(5, 2, 1, 1, 1, '554'),
(0, 3, 1, 1, 1, 'Walmart Supercenters'),
(1, 3, 1, 1, 1, '16'),
(2, 3, 1, 1, 1, '-'),
(3, 3, 1, 1, 1, '8'),
(4, 3, 1, 1, 1, '574,058,937'),
(5, 3, 1, 1, 1, '3,182'),
(0, 4, 1, 1, 1, 'Neighborhood Markets'),
(1, 4, 1, 1, 1, '17'),
(2, 4, 1, 1, 1, '-'),
(3, 4, 1, 1, 1, '-'),
(4, 4, 1, 1, 1, '11,700,094'),
(5, 4, 1, 1, 1, '284'),
(0, 5, 1, 1, 1, 'Neighborhood Markets'),
(1, 5, 1, 1, 1, '17'),
(2, 5, 1, 1, 1, '-'),
(3, 5, 1, 1, 1, '-'),
(4, 5, 1, 1, 1, '10,948,346'),
(5, 5, 1, 1, 1, '258'),
(0, 6, 1, 1, 1, 'Amigo'),
(1, 6, 1, 1, 1, '-'),
(2, 6, 1, 1, 1, '-'),
(3, 6, 1, 1, 1, '-'),
(4, 6, 1, 1, 1, '672,148'),
(5, 6, 1, 1, 1, '24'),
(0, 7, 1, 1, 1, 'Supermercado'),
(1, 7, 1, 1, 1, '-'),
(2, 7, 1, 1, 1, '-'),
(3, 7, 1, 1, 1, '-'),
(4, 7, 1, 1, 1, '79,600'),
(5, 7, 1, 1, 1, '2'),
(0, 8, 1, 1, 1, 'Small Formats'),
(1, 8, 1, 1, 1, '4'),
(2, 8, 1, 1, 1, '-'),
(3, 8, 1, 1, 1, '-'),
(4, 8, 1, 1, 1, '294,308'),
(5, 8, 1, 1, 1, '23'),
(0, 9, 1, 1, 1, 'Marketside'),
(1, 9, 1, 1, 1, '-'),
(2, 9, 1, 1, 1, '-'),
(3, 9, 1, 1, 1, '-'),
(4, 9, 1, 1, 1, '-'),
(5, 9, 1, 1, 1, '-'),
(0, 10, 1, 1, 1, 'Super Ahorros'),
(1, 10, 1, 1, 1, '-'),
(2, 10, 1, 1, 1, '-'),
(3, 10, 1, 1, 1, '-'),
(4, 10, 1, 1, 1, '46,349'),
(5, 10, 1, 1, 1, '3'),
(0, 11, 1, 1, 1, 'Walmart Express'),
(1, 11, 1, 1, 1, '4'),
(2, 11, 1, 1, 1, '-'),
(3, 11, 1, 1, 1, '-'),
(4, 11, 1, 1, 1, '239,702'),
(5, 11, 1, 1, 1, '17'),
(0, 12, 1, 1, 1, 'Walmart on Campus'),
(1, 12, 1, 1, 1, '-'),
(2, 12, 1, 1, 1, '-'),
(3, 12, 1, 1, 1, '-'),
(4, 12, 1, 1, 1, '8,257'),
(5, 12, 1, 1, 1, '3'),
(0, 13, 1, 1, 1, 'Walmart U.S.'),
(1, 13, 1, 1, 1, '38'),
(2, 13, 1, 1, 1, '-'),
(3, 13, 1, 1, 1, ''),
(4, 13, 1, 1, 1, '644,331,264'),
(5, 13, 1, 1, 1, '4,043'),
(0, 14, 1, 1, 1, "Sam's Club"),
(1, 14, 1, 1, 1, ''),
(2, 14, 1, 1, 1, '-'),
(3, 14, 1, 1, 1, '-'),
(4, 14, 1, 1, 1, '82,669,348'),
(5, 14, 1, 1, 1, '620'),
(0, 15, 1, 1, 1, 'Total U.S.'),
(1, 15, 1, 1, 1, '38'),
(2, 15, 1, 1, 1, '-'),
(3, 15, 1, 1, 1, '-'),
(4, 15, 1, 1, 1, '727,000,612'),
(5, 15, 1, 1, 1, '4,663'),
(0, 16, 1, 1, 1, 'International'),
(1, 16, 1, 1, 1, '50'),
(2, 16, 1, 1, 1, '(3)'),
(3, 16, 1, 1, 1, '(1)'),
(4, 16, 1, 1, 1, '349,900,213'),
(5, 16, 1, 1, 1, '6,194'),
(0, 17, 6, 1, 1, ''),
(0, 18, 1, 1, 1, 'Total Walmart'),
(1, 18, 1, 1, 1, '88'),
(2, 18, 1, 1, 1, '(3)'),
(3, 18, 1, 1, 1, '(1)'),
(4, 18, 1, 1, 1, '1,076,900,825'),
(5, 18, 1, 1, 1, '10,857'),
(0,
19,
6,
1,
1,
"*Relocations/Expansion/Conversion represents net unit changes. Also includes units moved from Walmart International to the respective Walmart U.S. and Sam's Clubs Segments.")]]
In [6]:
# Hand the extracted cells and page list to pdf.output, asking for a JSON dump
# in cells.json (that file is read back with json.load further down).
# NOTE(review): exact file layout is defined by pdftableextract — confirm there.
pdf.output(cells, pages, cells_json_filename="cells.json")
In [8]:
# Load the JSON document from cells.json. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open until the
# object happens to be garbage-collected.
with open("cells.json") as cells_file:
    jd = json.load(cells_file)
In [10]:
# Inspect the top-level keys of the loaded JSON document.
jd.keys()
Out[10]:
[u'src', u'cells', u'colnames', u'name']
In [11]:
# Field names for the cell records; they are zipped positionally with each
# record when the table is rebuilt later in the notebook.
jd['colnames']
Out[11]:
[u'x', u'y', u'width', u'height', u'page', u'contents']
In [15]:
# Show the cell records stored in the first entry of jd['cells'].
jd['cells'][0]
Out[15]:
[[0,
0,
6,
1,
1,
u'Unit Count and Square Footage First Quarter, Fiscal Year 2014 As of April 30, 2013'],
[0, 1, 1, 1, 1, u'Format'],
[1, 1, 1, 1, 1, u'New'],
[2, 1, 1, 1, 1, u'Closed'],
[3, 1, 1, 1, 1, u'Relocation/ Expansion/ * Conversion'],
[4, 1, 1, 1, 1, u'Ending Square Footage'],
[5, 1, 1, 1, 1, u'Total Locations'],
[0, 2, 1, 1, 1, u'Walmart Discount Stores'],
[1, 2, 1, 1, 1, u'1'],
[2, 2, 1, 1, 1, u''],
[3, 2, 1, 1, 1, u'(8)'],
[4, 2, 1, 1, 1, u'58,277,925'],
[5, 2, 1, 1, 1, u'554'],
[0, 3, 1, 1, 1, u'Walmart Supercenters'],
[1, 3, 1, 1, 1, u'16'],
[2, 3, 1, 1, 1, u'-'],
[3, 3, 1, 1, 1, u'8'],
[4, 3, 1, 1, 1, u'574,058,937'],
[5, 3, 1, 1, 1, u'3,182'],
[0, 4, 1, 1, 1, u'Neighborhood Markets'],
[1, 4, 1, 1, 1, u'17'],
[2, 4, 1, 1, 1, u'-'],
[3, 4, 1, 1, 1, u'-'],
[4, 4, 1, 1, 1, u'11,700,094'],
[5, 4, 1, 1, 1, u'284'],
[0, 5, 1, 1, 1, u'Neighborhood Markets'],
[1, 5, 1, 1, 1, u'17'],
[2, 5, 1, 1, 1, u'-'],
[3, 5, 1, 1, 1, u'-'],
[4, 5, 1, 1, 1, u'10,948,346'],
[5, 5, 1, 1, 1, u'258'],
[0, 6, 1, 1, 1, u'Amigo'],
[1, 6, 1, 1, 1, u'-'],
[2, 6, 1, 1, 1, u'-'],
[3, 6, 1, 1, 1, u'-'],
[4, 6, 1, 1, 1, u'672,148'],
[5, 6, 1, 1, 1, u'24'],
[0, 7, 1, 1, 1, u'Supermercado'],
[1, 7, 1, 1, 1, u'-'],
[2, 7, 1, 1, 1, u'-'],
[3, 7, 1, 1, 1, u'-'],
[4, 7, 1, 1, 1, u'79,600'],
[5, 7, 1, 1, 1, u'2'],
[0, 8, 1, 1, 1, u'Small Formats'],
[1, 8, 1, 1, 1, u'4'],
[2, 8, 1, 1, 1, u'-'],
[3, 8, 1, 1, 1, u'-'],
[4, 8, 1, 1, 1, u'294,308'],
[5, 8, 1, 1, 1, u'23'],
[0, 9, 1, 1, 1, u'Marketside'],
[1, 9, 1, 1, 1, u'-'],
[2, 9, 1, 1, 1, u'-'],
[3, 9, 1, 1, 1, u'-'],
[4, 9, 1, 1, 1, u'-'],
[5, 9, 1, 1, 1, u'-'],
[0, 10, 1, 1, 1, u'Super Ahorros'],
[1, 10, 1, 1, 1, u'-'],
[2, 10, 1, 1, 1, u'-'],
[3, 10, 1, 1, 1, u'-'],
[4, 10, 1, 1, 1, u'46,349'],
[5, 10, 1, 1, 1, u'3'],
[0, 11, 1, 1, 1, u'Walmart Express'],
[1, 11, 1, 1, 1, u'4'],
[2, 11, 1, 1, 1, u'-'],
[3, 11, 1, 1, 1, u'-'],
[4, 11, 1, 1, 1, u'239,702'],
[5, 11, 1, 1, 1, u'17'],
[0, 12, 1, 1, 1, u'Walmart on Campus'],
[1, 12, 1, 1, 1, u'-'],
[2, 12, 1, 1, 1, u'-'],
[3, 12, 1, 1, 1, u'-'],
[4, 12, 1, 1, 1, u'8,257'],
[5, 12, 1, 1, 1, u'3'],
[0, 13, 1, 1, 1, u'Walmart U.S.'],
[1, 13, 1, 1, 1, u'38'],
[2, 13, 1, 1, 1, u'-'],
[3, 13, 1, 1, 1, u''],
[4, 13, 1, 1, 1, u'644,331,264'],
[5, 13, 1, 1, 1, u'4,043'],
[0, 14, 1, 1, 1, u"Sam's Club"],
[1, 14, 1, 1, 1, u''],
[2, 14, 1, 1, 1, u'-'],
[3, 14, 1, 1, 1, u'-'],
[4, 14, 1, 1, 1, u'82,669,348'],
[5, 14, 1, 1, 1, u'620'],
[0, 15, 1, 1, 1, u'Total U.S.'],
[1, 15, 1, 1, 1, u'38'],
[2, 15, 1, 1, 1, u'-'],
[3, 15, 1, 1, 1, u'-'],
[4, 15, 1, 1, 1, u'727,000,612'],
[5, 15, 1, 1, 1, u'4,663'],
[0, 16, 1, 1, 1, u'International'],
[1, 16, 1, 1, 1, u'50'],
[2, 16, 1, 1, 1, u'(3)'],
[3, 16, 1, 1, 1, u'(1)'],
[4, 16, 1, 1, 1, u'349,900,213'],
[5, 16, 1, 1, 1, u'6,194'],
[0, 17, 6, 1, 1, u''],
[0, 18, 1, 1, 1, u'Total Walmart'],
[1, 18, 1, 1, 1, u'88'],
[2, 18, 1, 1, 1, u'(3)'],
[3, 18, 1, 1, 1, u'(1)'],
[4, 18, 1, 1, 1, u'1,076,900,825'],
[5, 18, 1, 1, 1, u'10,857'],
[0,
19,
6,
1,
1,
u"*Relocations/Expansion/Conversion represents net unit changes. Also includes units moved from Walmart International to the respective Walmart U.S. and Sam's Clubs Segments."]]
In [19]:
from collections import defaultdict
from operator import itemgetter
In [26]:
# Number of table columns: the largest value found in position 0 of the cell
# records (the 'x' field per jd['colnames']) plus one, since x starts at 0.
col_num = 1 + max(
    cell[0] for cell in jd['cells'][0]
)
In [27]:
# Sanity-check the computed column count.
col_num
Out[27]:
6
In [28]:
def _blank_row():
    """Return a new list of `col_num` empty strings.

    `col_num` is looked up when the function is called, not when it is
    defined, so the row width follows the current global value.
    """
    return [""] * col_num


# Mapping of row key -> list of cell texts; a missing key is filled with a
# fresh blank row on first access.
dd = defaultdict(_blank_row)
In [29]:
# Pivot the flat list of cell records into rows: each record is turned into a
# {field name: value} mapping, then its text is stored at dd[y][x].
column_names = jd['colnames']
for cell in jd['cells'][0]:
    fields = dict(zip(column_names, cell))
    y_pos = fields['y']
    x_pos = fields['x']
    dd[y_pos][x_pos] = fields['contents']
In [32]:
# Pretty-print the reconstructed rows as JSON. The parenthesised call form
# prints the same text on Python 2 and is also valid on Python 3, where the
# bare `print` statement is a SyntaxError.
print(json.dumps(dd, indent=2))
{
"0": [
"Unit Count and Square Footage First Quarter, Fiscal Year 2014 As of April 30, 2013",
"",
"",
"",
"",
""
],
"1": [
"Format",
"New",
"Closed",
"Relocation/ Expansion/ * Conversion",
"Ending Square Footage",
"Total Locations"
],
"2": [
"Walmart Discount Stores",
"1",
"",
"(8)",
"58,277,925",
"554"
],
"3": [
"Walmart Supercenters",
"16",
"-",
"8",
"574,058,937",
"3,182"
],
"4": [
"Neighborhood Markets",
"17",
"-",
"-",
"11,700,094",
"284"
],
"5": [
"Neighborhood Markets",
"17",
"-",
"-",
"10,948,346",
"258"
],
"6": [
"Amigo",
"-",
"-",
"-",
"672,148",
"24"
],
"7": [
"Supermercado",
"-",
"-",
"-",
"79,600",
"2"
],
"8": [
"Small Formats",
"4",
"-",
"-",
"294,308",
"23"
],
"9": [
"Marketside",
"-",
"-",
"-",
"-",
"-"
],
"10": [
"Super Ahorros",
"-",
"-",
"-",
"46,349",
"3"
],
"11": [
"Walmart Express",
"4",
"-",
"-",
"239,702",
"17"
],
"12": [
"Walmart on Campus",
"-",
"-",
"-",
"8,257",
"3"
],
"13": [
"Walmart U.S.",
"38",
"-",
"",
"644,331,264",
"4,043"
],
"14": [
"Sam's Club",
"",
"-",
"-",
"82,669,348",
"620"
],
"15": [
"Total U.S.",
"38",
"-",
"-",
"727,000,612",
"4,663"
],
"16": [
"International",
"50",
"(3)",
"(1)",
"349,900,213",
"6,194"
],
"17": [
"",
"",
"",
"",
"",
""
],
"18": [
"Total Walmart",
"88",
"(3)",
"(1)",
"1,076,900,825",
"10,857"
],
"19": [
"*Relocations/Expansion/Conversion represents net unit changes. Also includes units moved from Walmart International to the respective Walmart U.S. and Sam's Clubs Segments.",
"",
"",
"",
"",
""
]
}
In [38]:
# Build a DataFrame with one row per key of dd (the y index) and one column
# per position in each row list (the x index). sort_index() pins the rows to
# ascending y order instead of relying on the iteration order of the dict.
df = pd.DataFrame.from_dict(dd, orient='index').sort_index()
In [39]:
# Preview the first rows of the reconstructed table.
df.head()
Out[39]:
0
1
2
3
4
5
0
Unit Count and Square Footage First Quarter, F...
1
Format
New
Closed
Relocation/ Expansion/ * Conversion
Ending Square Footage
Total Locations
2
Walmart Discount Stores
1
(8)
58,277,925
554
3
Walmart Supercenters
16
-
8
574,058,937
3,182
4
Neighborhood Markets
17
-
-
11,700,094
284
In [ ]:
Content source: pdfliberation/pdf_table_extraction
Similar notebooks: