In [ ]:
!pwd
In [7]:
from __future__ import print_function
import codecs
import os
import re
import string
import subprocess
PDF_SUBFOLDER = 'pdf'
TXT_SUBFOLDER = 'txt'
def create_path(path):
    """Create the directory ``path``, including any missing parents.

    An already-existing directory is not an error. Any other failure of
    ``os.makedirs`` (for example ``path`` exists but is a regular file)
    is re-raised as the original ``OSError``.
    """
    try:
        os.makedirs(path)
    except OSError:
        # makedirs also fails when the directory is already there; that
        # case is fine, anything else is a real error.
        if os.path.isdir(path):
            return
        raise
# Convert every file found in PDF_SUBFOLDER to text with the external
# `pdftotext` tool, writing <name>.txt into TXT_SUBFOLDER. Files whose
# text output already exists are skipped, so the cell can be re-run.
files = os.listdir(PDF_SUBFOLDER)
create_path(TXT_SUBFOLDER)
total = len(files)
for i, f in enumerate(files, 1):  # 1-based so progress reads 1/N .. N/N
    pdf_path = os.path.join(PDF_SUBFOLDER, f)
    txt_path = os.path.join(TXT_SUBFOLDER, f + '.txt')
    if os.path.isfile(txt_path):
        print('skipping %s, already exists.' % (pdf_path, ))
        continue
    # Layout preservation crucial to maintain clues about tabular data
    # The command is an argument list (no shell), so file names containing
    # spaces, quotes or shell metacharacters are passed through verbatim.
    cmd = ['pdftotext', '-enc', 'UTF-8', '-layout', pdf_path, txt_path]
    print('%d/%d %s' % (i, total, ' '.join(cmd)))
    status = subprocess.call(cmd)
    if status != 0:
        # Report the failure but keep going with the remaining files.
        print('pdftotext failed with exit status %d for %s'
              % (status, pdf_path))
In [ ]: