In [2]:
# Show the installed metadata (name, version, location) for pdf-table-extract.
!pip show pdf-table-extract


---
Name: pdf-table-extract
Version: 0.1
Location: /home/blannon/epd/lib/python2.7/site-packages/pdf_table_extract-0.1-py2.7.egg
Requires: 

In [1]:
# Print the working directory; the relative paths used in later cells
# ("example/example.pdf", "cells.json") resolve against it.
!pwd


/home/blannon/src/pdf-table-extract

In [7]:
import sys
import json

import pandas as pd
import pdftableextract as pdf

In [2]:
# Re-import the already-loaded pdftableextract module (Python 2 builtin
# `reload`) -- presumably to pick up edits to the local checkout; confirm
# whether this cell is still needed.
reload(pdf)


Out[2]:
<module 'pdftableextract' from '/home/blannon/src/pdf-table-extract/src/pdftableextract/__init__.pyc'>

In [3]:
# Run the table extractor over every requested page of the example PDF,
# collecting one result per page in the same order as `pages`.
pages = ["1"]
cells = []
for p in pages:
    cells.append(pdf.process_page("example/example.pdf", p))

In [4]:
# Display the raw extraction result: one list per page, each holding
# six-field cell tuples (the field names are listed by jd['colnames'] below).
cells


Out[4]:
[[(0,
   0,
   6,
   1,
   1,
   'Unit Count and Square Footage First Quarter, Fiscal Year 2014 As of April 30, 2013'),
  (0, 1, 1, 1, 1, 'Format'),
  (1, 1, 1, 1, 1, 'New'),
  (2, 1, 1, 1, 1, 'Closed'),
  (3, 1, 1, 1, 1, 'Relocation/ Expansion/ * Conversion'),
  (4, 1, 1, 1, 1, 'Ending Square Footage'),
  (5, 1, 1, 1, 1, 'Total Locations'),
  (0, 2, 1, 1, 1, 'Walmart Discount Stores'),
  (1, 2, 1, 1, 1, '1'),
  (2, 2, 1, 1, 1, ''),
  (3, 2, 1, 1, 1, '(8)'),
  (4, 2, 1, 1, 1, '58,277,925'),
  (5, 2, 1, 1, 1, '554'),
  (0, 3, 1, 1, 1, 'Walmart Supercenters'),
  (1, 3, 1, 1, 1, '16'),
  (2, 3, 1, 1, 1, '-'),
  (3, 3, 1, 1, 1, '8'),
  (4, 3, 1, 1, 1, '574,058,937'),
  (5, 3, 1, 1, 1, '3,182'),
  (0, 4, 1, 1, 1, 'Neighborhood Markets'),
  (1, 4, 1, 1, 1, '17'),
  (2, 4, 1, 1, 1, '-'),
  (3, 4, 1, 1, 1, '-'),
  (4, 4, 1, 1, 1, '11,700,094'),
  (5, 4, 1, 1, 1, '284'),
  (0, 5, 1, 1, 1, 'Neighborhood Markets'),
  (1, 5, 1, 1, 1, '17'),
  (2, 5, 1, 1, 1, '-'),
  (3, 5, 1, 1, 1, '-'),
  (4, 5, 1, 1, 1, '10,948,346'),
  (5, 5, 1, 1, 1, '258'),
  (0, 6, 1, 1, 1, 'Amigo'),
  (1, 6, 1, 1, 1, '-'),
  (2, 6, 1, 1, 1, '-'),
  (3, 6, 1, 1, 1, '-'),
  (4, 6, 1, 1, 1, '672,148'),
  (5, 6, 1, 1, 1, '24'),
  (0, 7, 1, 1, 1, 'Supermercado'),
  (1, 7, 1, 1, 1, '-'),
  (2, 7, 1, 1, 1, '-'),
  (3, 7, 1, 1, 1, '-'),
  (4, 7, 1, 1, 1, '79,600'),
  (5, 7, 1, 1, 1, '2'),
  (0, 8, 1, 1, 1, 'Small Formats'),
  (1, 8, 1, 1, 1, '4'),
  (2, 8, 1, 1, 1, '-'),
  (3, 8, 1, 1, 1, '-'),
  (4, 8, 1, 1, 1, '294,308'),
  (5, 8, 1, 1, 1, '23'),
  (0, 9, 1, 1, 1, 'Marketside'),
  (1, 9, 1, 1, 1, '-'),
  (2, 9, 1, 1, 1, '-'),
  (3, 9, 1, 1, 1, '-'),
  (4, 9, 1, 1, 1, '-'),
  (5, 9, 1, 1, 1, '-'),
  (0, 10, 1, 1, 1, 'Super Ahorros'),
  (1, 10, 1, 1, 1, '-'),
  (2, 10, 1, 1, 1, '-'),
  (3, 10, 1, 1, 1, '-'),
  (4, 10, 1, 1, 1, '46,349'),
  (5, 10, 1, 1, 1, '3'),
  (0, 11, 1, 1, 1, 'Walmart Express'),
  (1, 11, 1, 1, 1, '4'),
  (2, 11, 1, 1, 1, '-'),
  (3, 11, 1, 1, 1, '-'),
  (4, 11, 1, 1, 1, '239,702'),
  (5, 11, 1, 1, 1, '17'),
  (0, 12, 1, 1, 1, 'Walmart on Campus'),
  (1, 12, 1, 1, 1, '-'),
  (2, 12, 1, 1, 1, '-'),
  (3, 12, 1, 1, 1, '-'),
  (4, 12, 1, 1, 1, '8,257'),
  (5, 12, 1, 1, 1, '3'),
  (0, 13, 1, 1, 1, 'Walmart U.S.'),
  (1, 13, 1, 1, 1, '38'),
  (2, 13, 1, 1, 1, '-'),
  (3, 13, 1, 1, 1, ''),
  (4, 13, 1, 1, 1, '644,331,264'),
  (5, 13, 1, 1, 1, '4,043'),
  (0, 14, 1, 1, 1, "Sam's Club"),
  (1, 14, 1, 1, 1, ''),
  (2, 14, 1, 1, 1, '-'),
  (3, 14, 1, 1, 1, '-'),
  (4, 14, 1, 1, 1, '82,669,348'),
  (5, 14, 1, 1, 1, '620'),
  (0, 15, 1, 1, 1, 'Total U.S.'),
  (1, 15, 1, 1, 1, '38'),
  (2, 15, 1, 1, 1, '-'),
  (3, 15, 1, 1, 1, '-'),
  (4, 15, 1, 1, 1, '727,000,612'),
  (5, 15, 1, 1, 1, '4,663'),
  (0, 16, 1, 1, 1, 'International'),
  (1, 16, 1, 1, 1, '50'),
  (2, 16, 1, 1, 1, '(3)'),
  (3, 16, 1, 1, 1, '(1)'),
  (4, 16, 1, 1, 1, '349,900,213'),
  (5, 16, 1, 1, 1, '6,194'),
  (0, 17, 6, 1, 1, ''),
  (0, 18, 1, 1, 1, 'Total Walmart'),
  (1, 18, 1, 1, 1, '88'),
  (2, 18, 1, 1, 1, '(3)'),
  (3, 18, 1, 1, 1, '(1)'),
  (4, 18, 1, 1, 1, '1,076,900,825'),
  (5, 18, 1, 1, 1, '10,857'),
  (0,
   19,
   6,
   1,
   1,
   "*Relocations/Expansion/Conversion represents net unit changes. Also includes units moved from Walmart International to the respective Walmart U.S. and Sam's Clubs Segments.")]]

In [6]:
# Hand the extracted cells to pdftableextract's output helper, requesting a
# JSON dump named "cells.json" (presumably written relative to the working
# directory; it is read back in the next cell).
pdf.output(cells, pages, cells_json_filename="cells.json")

In [8]:
# Read back the JSON dump of the extracted cells. The context manager closes
# the file handle as soon as parsing finishes instead of leaving it open
# until garbage collection.
with open("cells.json") as cells_file:
    jd = json.load(cells_file)

In [10]:
# List the top-level keys of the loaded JSON document.
jd.keys()


Out[10]:
[u'src', u'cells', u'colnames', u'name']

In [11]:
# Field names for each cell record, in the order the fields appear.
jd['colnames']


Out[11]:
[u'x', u'y', u'width', u'height', u'page', u'contents']

In [15]:
# Inspect the first entry of jd['cells'] (a list of cell records).
jd['cells'][0]


Out[15]:
[[0,
  0,
  6,
  1,
  1,
  u'Unit Count and Square Footage First Quarter, Fiscal Year 2014 As of April 30, 2013'],
 [0, 1, 1, 1, 1, u'Format'],
 [1, 1, 1, 1, 1, u'New'],
 [2, 1, 1, 1, 1, u'Closed'],
 [3, 1, 1, 1, 1, u'Relocation/ Expansion/ * Conversion'],
 [4, 1, 1, 1, 1, u'Ending Square Footage'],
 [5, 1, 1, 1, 1, u'Total Locations'],
 [0, 2, 1, 1, 1, u'Walmart Discount Stores'],
 [1, 2, 1, 1, 1, u'1'],
 [2, 2, 1, 1, 1, u''],
 [3, 2, 1, 1, 1, u'(8)'],
 [4, 2, 1, 1, 1, u'58,277,925'],
 [5, 2, 1, 1, 1, u'554'],
 [0, 3, 1, 1, 1, u'Walmart Supercenters'],
 [1, 3, 1, 1, 1, u'16'],
 [2, 3, 1, 1, 1, u'-'],
 [3, 3, 1, 1, 1, u'8'],
 [4, 3, 1, 1, 1, u'574,058,937'],
 [5, 3, 1, 1, 1, u'3,182'],
 [0, 4, 1, 1, 1, u'Neighborhood Markets'],
 [1, 4, 1, 1, 1, u'17'],
 [2, 4, 1, 1, 1, u'-'],
 [3, 4, 1, 1, 1, u'-'],
 [4, 4, 1, 1, 1, u'11,700,094'],
 [5, 4, 1, 1, 1, u'284'],
 [0, 5, 1, 1, 1, u'Neighborhood Markets'],
 [1, 5, 1, 1, 1, u'17'],
 [2, 5, 1, 1, 1, u'-'],
 [3, 5, 1, 1, 1, u'-'],
 [4, 5, 1, 1, 1, u'10,948,346'],
 [5, 5, 1, 1, 1, u'258'],
 [0, 6, 1, 1, 1, u'Amigo'],
 [1, 6, 1, 1, 1, u'-'],
 [2, 6, 1, 1, 1, u'-'],
 [3, 6, 1, 1, 1, u'-'],
 [4, 6, 1, 1, 1, u'672,148'],
 [5, 6, 1, 1, 1, u'24'],
 [0, 7, 1, 1, 1, u'Supermercado'],
 [1, 7, 1, 1, 1, u'-'],
 [2, 7, 1, 1, 1, u'-'],
 [3, 7, 1, 1, 1, u'-'],
 [4, 7, 1, 1, 1, u'79,600'],
 [5, 7, 1, 1, 1, u'2'],
 [0, 8, 1, 1, 1, u'Small Formats'],
 [1, 8, 1, 1, 1, u'4'],
 [2, 8, 1, 1, 1, u'-'],
 [3, 8, 1, 1, 1, u'-'],
 [4, 8, 1, 1, 1, u'294,308'],
 [5, 8, 1, 1, 1, u'23'],
 [0, 9, 1, 1, 1, u'Marketside'],
 [1, 9, 1, 1, 1, u'-'],
 [2, 9, 1, 1, 1, u'-'],
 [3, 9, 1, 1, 1, u'-'],
 [4, 9, 1, 1, 1, u'-'],
 [5, 9, 1, 1, 1, u'-'],
 [0, 10, 1, 1, 1, u'Super Ahorros'],
 [1, 10, 1, 1, 1, u'-'],
 [2, 10, 1, 1, 1, u'-'],
 [3, 10, 1, 1, 1, u'-'],
 [4, 10, 1, 1, 1, u'46,349'],
 [5, 10, 1, 1, 1, u'3'],
 [0, 11, 1, 1, 1, u'Walmart Express'],
 [1, 11, 1, 1, 1, u'4'],
 [2, 11, 1, 1, 1, u'-'],
 [3, 11, 1, 1, 1, u'-'],
 [4, 11, 1, 1, 1, u'239,702'],
 [5, 11, 1, 1, 1, u'17'],
 [0, 12, 1, 1, 1, u'Walmart on Campus'],
 [1, 12, 1, 1, 1, u'-'],
 [2, 12, 1, 1, 1, u'-'],
 [3, 12, 1, 1, 1, u'-'],
 [4, 12, 1, 1, 1, u'8,257'],
 [5, 12, 1, 1, 1, u'3'],
 [0, 13, 1, 1, 1, u'Walmart U.S.'],
 [1, 13, 1, 1, 1, u'38'],
 [2, 13, 1, 1, 1, u'-'],
 [3, 13, 1, 1, 1, u''],
 [4, 13, 1, 1, 1, u'644,331,264'],
 [5, 13, 1, 1, 1, u'4,043'],
 [0, 14, 1, 1, 1, u"Sam's Club"],
 [1, 14, 1, 1, 1, u''],
 [2, 14, 1, 1, 1, u'-'],
 [3, 14, 1, 1, 1, u'-'],
 [4, 14, 1, 1, 1, u'82,669,348'],
 [5, 14, 1, 1, 1, u'620'],
 [0, 15, 1, 1, 1, u'Total U.S.'],
 [1, 15, 1, 1, 1, u'38'],
 [2, 15, 1, 1, 1, u'-'],
 [3, 15, 1, 1, 1, u'-'],
 [4, 15, 1, 1, 1, u'727,000,612'],
 [5, 15, 1, 1, 1, u'4,663'],
 [0, 16, 1, 1, 1, u'International'],
 [1, 16, 1, 1, 1, u'50'],
 [2, 16, 1, 1, 1, u'(3)'],
 [3, 16, 1, 1, 1, u'(1)'],
 [4, 16, 1, 1, 1, u'349,900,213'],
 [5, 16, 1, 1, 1, u'6,194'],
 [0, 17, 6, 1, 1, u''],
 [0, 18, 1, 1, 1, u'Total Walmart'],
 [1, 18, 1, 1, 1, u'88'],
 [2, 18, 1, 1, 1, u'(3)'],
 [3, 18, 1, 1, 1, u'(1)'],
 [4, 18, 1, 1, 1, u'1,076,900,825'],
 [5, 18, 1, 1, 1, u'10,857'],
 [0,
  19,
  6,
  1,
  1,
  u"*Relocations/Expansion/Conversion represents net unit changes. Also includes units moved from Walmart International to the respective Walmart U.S. and Sam's Clubs Segments."]]

In [19]:
from collections import defaultdict
from operator import itemgetter

In [26]:
# Number of grid columns: the largest x coordinate (the first field of every
# cell record) plus one, since x is zero-based.
col_num = 1 + max(cell[0] for cell in jd['cells'][0])

In [27]:
# Display the computed column count.
col_num


Out[27]:
6

In [28]:
def _empty_row():
    """Return a fresh row of `col_num` empty strings (col_num is read at call time)."""
    return [""] * col_num


# Row grid keyed by y coordinate; a row is created blank on first access.
dd = defaultdict(_empty_row)

In [29]:
# Drop every cell's text into its slot of the row grid: the cell's y picks
# the row (created blank on first access) and its x picks the column.
for row in jd['cells'][0]:
    row_dict = {name: value for name, value in zip(jd['colnames'], row)}
    target_row = dd[row_dict['y']]
    target_row[row_dict['x']] = row_dict['contents']

In [32]:
# Pretty-print the row grid as JSON. The call form of print behaves the same
# on Python 2 and 3 for a single argument (the statement form is a syntax
# error on Python 3), and sort_keys=True emits the rows in ascending key
# order instead of whatever order the dict happens to iterate in.
print(json.dumps(dd, indent=2, sort_keys=True))


{
  "0": [
    "Unit Count and Square Footage First Quarter, Fiscal Year 2014 As of April 30, 2013", 
    "", 
    "", 
    "", 
    "", 
    ""
  ], 
  "1": [
    "Format", 
    "New", 
    "Closed", 
    "Relocation/ Expansion/ * Conversion", 
    "Ending Square Footage", 
    "Total Locations"
  ], 
  "2": [
    "Walmart Discount Stores", 
    "1", 
    "", 
    "(8)", 
    "58,277,925", 
    "554"
  ], 
  "3": [
    "Walmart Supercenters", 
    "16", 
    "-", 
    "8", 
    "574,058,937", 
    "3,182"
  ], 
  "4": [
    "Neighborhood Markets", 
    "17", 
    "-", 
    "-", 
    "11,700,094", 
    "284"
  ], 
  "5": [
    "Neighborhood Markets", 
    "17", 
    "-", 
    "-", 
    "10,948,346", 
    "258"
  ], 
  "6": [
    "Amigo", 
    "-", 
    "-", 
    "-", 
    "672,148", 
    "24"
  ], 
  "7": [
    "Supermercado", 
    "-", 
    "-", 
    "-", 
    "79,600", 
    "2"
  ], 
  "8": [
    "Small Formats", 
    "4", 
    "-", 
    "-", 
    "294,308", 
    "23"
  ], 
  "9": [
    "Marketside", 
    "-", 
    "-", 
    "-", 
    "-", 
    "-"
  ], 
  "10": [
    "Super Ahorros", 
    "-", 
    "-", 
    "-", 
    "46,349", 
    "3"
  ], 
  "11": [
    "Walmart Express", 
    "4", 
    "-", 
    "-", 
    "239,702", 
    "17"
  ], 
  "12": [
    "Walmart on Campus", 
    "-", 
    "-", 
    "-", 
    "8,257", 
    "3"
  ], 
  "13": [
    "Walmart U.S.", 
    "38", 
    "-", 
    "", 
    "644,331,264", 
    "4,043"
  ], 
  "14": [
    "Sam's Club", 
    "", 
    "-", 
    "-", 
    "82,669,348", 
    "620"
  ], 
  "15": [
    "Total U.S.", 
    "38", 
    "-", 
    "-", 
    "727,000,612", 
    "4,663"
  ], 
  "16": [
    "International", 
    "50", 
    "(3)", 
    "(1)", 
    "349,900,213", 
    "6,194"
  ], 
  "17": [
    "", 
    "", 
    "", 
    "", 
    "", 
    ""
  ], 
  "18": [
    "Total Walmart", 
    "88", 
    "(3)", 
    "(1)", 
    "1,076,900,825", 
    "10,857"
  ], 
  "19": [
    "*Relocations/Expansion/Conversion represents net unit changes. Also includes units moved from Walmart International to the respective Walmart U.S. and Sam's Clubs Segments.", 
    "", 
    "", 
    "", 
    "", 
    ""
  ]
}

In [38]:
# Build a DataFrame with one row per grid row (index = dict key, i.e. the y
# coordinate) and one column per x position. sort_index() pins the rows to
# ascending y order rather than relying on the dict's iteration order, which
# Python 2 does not guarantee.
df = pd.DataFrame.from_dict(dd, orient='index').sort_index()

In [39]:
# Preview the first rows of the frame.
df.head()


Out[39]:
0 1 2 3 4 5
0 Unit Count and Square Footage First Quarter, F...
1 Format New Closed Relocation/ Expansion/ * Conversion Ending Square Footage Total Locations
2 Walmart Discount Stores 1 (8) 58,277,925 554
3 Walmart Supercenters 16 - 8 574,058,937 3,182
4 Neighborhood Markets 17 - - 11,700,094 284

In [ ]: