In [ ]:
# Show the current working directory: the relative 'pdf' and 'txt' folder
# names used by the conversion cell are resolved against it.
!pwd

In [7]:
from __future__ import print_function
import codecs
import os
import re
import string
import subprocess

PDF_SUBFOLDER = 'pdf'
TXT_SUBFOLDER = 'txt'

def create_path(path):
    """Create the directory ``path``, including any missing parents.

    An already existing directory is not treated as an error. Any other
    ``OSError`` from ``os.makedirs`` (for example when ``path`` exists but is
    not a directory) is re-raised to the caller.
    """
    try:
        os.makedirs(path)
    except OSError:
        # makedirs failed; that is acceptable only if the directory is there.
        if os.path.isdir(path):
            return
        raise

# Convert every file found in PDF_SUBFOLDER to text with pdftotext, writing
# '<name>.txt' into TXT_SUBFOLDER. Files whose text output already exists are
# skipped, so re-running the cell only converts what is missing.
files = os.listdir(PDF_SUBFOLDER)
create_path(os.path.join(TXT_SUBFOLDER))

for i, f in enumerate(files):

    pdf_path = os.path.join(PDF_SUBFOLDER, f)
    txt_path = os.path.join(TXT_SUBFOLDER, f + '.txt')

    if not os.path.isfile(txt_path):
        # Layout preservation crucial to maintain clues about tabular data
        # The arguments are passed as a list and no shell is involved, so file
        # names containing spaces or shell metacharacters are handled safely.
        args = ['pdftotext', '-enc', 'UTF-8', '-layout', pdf_path, txt_path]
        print('%d/%d %s' % (i, len(files), ' '.join(args)))
        try:
            status = subprocess.call(args)
        except OSError as err:
            # pdftotext itself could not be started (e.g. it is not installed);
            # every remaining file would fail the same way, so stop here.
            print('could not run pdftotext: %s' % (err, ))
            break
        if status != 0:
            # Surface conversion failures instead of silently ignoring them.
            print('pdftotext exited with status %d for %s' % (status, pdf_path))
    else:
        print('skipping %s, already exists.' % (pdf_path, ))


0/4 pdftotext -enc UTF-8 -layout pdf/EA716610-EA562590-EA958701.pdf txt/EA716610-EA562590-EA958701.pdf.txt
1/4 pdftotext -enc UTF-8 -layout pdf/EP753324-ER508056-ER910760.pdf txt/EP753324-ER508056-ER910760.pdf.txt
2/4 pdftotext -enc UTF-8 -layout pdf/ER544111-ER421289-ER823264.pdf txt/ER544111-ER421289-ER823264.pdf.txt
3/4 pdftotext -enc UTF-8 -layout pdf/ER588705-ER457598-ER860368.pdf txt/ER588705-ER457598-ER860368.pdf.txt

In [ ]: