In [35]:
from __future__ import print_function
import codecs
import os
import re
import string
import subprocess
#Convert all pdfs
# Runs `pdftotext -layout` on every entry of ./pdf and writes the result to
# ./txt/<name>.txt. Entries whose text output already exists are skipped, so
# the cell can be re-run without redoing finished conversions.
if not os.path.isdir('txt'):
    # Create the output directory up front so the converter has a place to write.
    os.makedirs('txt')
files = os.listdir('pdf')
for i, f in enumerate(files):
    pdf_path = os.path.join('pdf', f)
    txt_path = os.path.join('txt', f + '.txt')
    if os.path.isfile(txt_path):
        print ('skipping %s, already exists.' % (pdf_path, ))
        continue
    #Layout preservation crucial to maintain clues about tabular data
    # The arguments are passed as a list (no shell involved), so file names
    # containing spaces or shell metacharacters reach pdftotext unchanged.
    cmd = ['pdftotext', '-layout', pdf_path, txt_path]
    print ('%d/%d %s' % (i, len(files), ' '.join(cmd)))
    exit_status = subprocess.call(cmd)
    if exit_status != 0:
        # Report the failure but keep converting the remaining files.
        print ('pdftotext exited with status %d for %s' % (exit_status, pdf_path))
In [147]:
#Existing Version
# For every text file in ./txt, print "caption value" pairs for
#   - the three lines starting at the first line that contains " $"
#     (labelled HEADER 1 to HEADER 3), and
#   - every line that ends in a dollar amount with cents, within the window
#     of lines that starts at a recognised section heading.

# Headings (compared against the upper-cased, stripped line) that open a window.
section_headings = (
    "SOURCES AND USES OF FUNDS",
    "SOURCES AND USES OF FUNDS*",
    "ESTIMATED SOURCES AND USES OF FUNDS",
    "ESTIMATED SOURCES AND USES OF FUNDS*",
    "SOURCES AND USES OF FUNDS(1)",
    "ESTIMATED SOURCES AND USES OF FUNDS(1)",
    "PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS",
)
# Number of lines (heading line included) scanned after a heading is seen.
lines_after_heading = 25
# Compiled once instead of once per line; raw string so the backslashes
# reach the regex engine untouched. The cents part is mandatory here.
dollar_amount_regex = re.compile(r"[\$]{0,1}[\s]{0,6}[0-9,]{0,15}(\.[0-9]{1,2})$")

for txt_name in os.listdir('txt'):
    print ("--------" + txt_name + "--------")
    printline = 0    # 1 while inside the window that follows a heading
    linesleft = 0    # lines still to scan in the current window
    blanklines = 0   # near-empty lines seen late in a window (only counted, not acted on)
    topfound = 0     # becomes 1 once the first " $" line has been seen
    headerline = 0   # 1..3 while the header lines are being printed
    with codecs.open(os.path.join('txt', txt_name), "r", "utf-8") as f:
        for line in f:
            strippedline = line.upper().strip()
            # Membership test rather than `find(...) > 0`, which would
            # overlook a match located at the very start of the line.
            if topfound == 0 and " $" in line:
                headerline = 1
                topfound = 1
            if 1 <= headerline <= 3:
                caption = "HEADER " + str(headerline)
                value = strippedline
                #df = df.append({'file':txt_name, 'caption':caption, 'value':value},ignore_index=True)
                print (u"{:60s} {:10s}".format(caption, value))
                headerline += 1
                # Header lines are not inspected for headings or amounts.
                continue
            if strippedline in section_headings:
                printline = 1
                linesleft = lines_after_heading
            if printline == 1:
                dollar_amount_match = dollar_amount_regex.search(strippedline)
                if dollar_amount_match:
                    # Everything before the amount is the caption.
                    split_at = dollar_amount_match.start(0)
                    caption = strippedline[:split_at].strip()
                    value = strippedline[split_at:].strip()
                    #df = df.append({'file':txt_name, 'caption':caption, 'value':value},ignore_index=True)
                    print (u"{:60s} {:10s}".format(caption, value))
                if len(line.strip()) < 5 and linesleft < 10:
                    blanklines += 1
                linesleft -= 1
                if linesleft == 0:
                    printline = 0
In [ ]:
#Issues:
## Doesn't pick up the caption in EP1059361 --> add "USES OF FUNDS" as a heading, but then no "SOURCES OF PAYMENTS"
## Doesn't pick up the line items in ER1075876 --> also match sequences of "...." as a table indicator, and be more lenient with cents values
In [154]:
#New Version
# For every text file in ./txt, print "caption value" pairs for
#   - the three lines starting at the first line that contains " $"
#     (labelled HEADER 1 to HEADER 3), and
#   - every tabular-looking line that ends in a dollar amount, within the
#     window of lines that starts at a recognised section heading.

# Headings (compared against the upper-cased, stripped line) that open a window.
section_headings = (
    "SOURCES AND USES OF FUNDS",
    "SOURCES AND USES OF FUNDS*",
    "ESTIMATED SOURCES AND USES OF FUNDS",
    "ESTIMATED SOURCES AND USES OF FUNDS*",
    "SOURCES AND USES OF FUNDS(1)",
    "ESTIMATED SOURCES AND USES OF FUNDS(1)",
    "PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS",
    "ESTIMATED USES OF FUNDS", #New
)
# Number of lines (heading line included) scanned after a heading is seen.
lines_after_heading = 25
#Include a minimum of preceding dots or whitespace
#Group 1 = preceding dots or whitespace (at least four of either)
#Group 2 = Dollar value
#Group 3 = Cents value if existing
# Plain raw strings (the `ur` prefix is a syntax error on Python 3), and
# compiled once instead of once per line.
dollar_amount_regex = re.compile(
    r"([\.]{4,}|[\s]{4,})[\s]*"
    r"([\$]{0,1}[\s]{0,6}[0-9,]{2,15})(\.[0-9]{1,2})?$"
)

for txt_name in os.listdir('txt'):
    print ("--------" + txt_name + "--------")
    printline = 0    # 1 while inside the window that follows a heading
    linesleft = 0    # lines still to scan in the current window
    blanklines = 0   # near-empty lines seen late in a window (only counted, not acted on)
    topfound = 0     # becomes 1 once the first " $" line has been seen
    headerline = 0   # 1..3 while the header lines are being printed
    with codecs.open(os.path.join('txt', txt_name), "r", "utf-8") as f:
        for i, line in enumerate(f):
            strippedline = line.upper().strip()
            # Membership test rather than `find(...) > 0`, which would
            # overlook a match located at the very start of the line.
            if topfound == 0 and " $" in line:
                headerline = 1
                topfound = 1
            if 1 <= headerline <= 3:
                caption = "HEADER " + str(headerline)
                value = strippedline
                #df = df.append({'file':txt_name, 'caption':caption, 'value':value},ignore_index=True)
                print (u"{:60s} {:10s}".format(caption, value))
                headerline += 1
                # Header lines are not inspected for headings or amounts.
                continue
            if strippedline in section_headings:
                printline = 1
                linesleft = lines_after_heading
                #print ("#### line:", i, "to", i+linesleft)
            if printline == 1:
                dollar_amount_match = dollar_amount_regex.search(strippedline)
                #Check whether we found something tabular and a dollar value
                # (group 2 is mandatory in the pattern, so a match implies it is non-empty)
                if dollar_amount_match:
                    # Caption ends where the dot/whitespace filler begins;
                    # the value runs from the dollar part to the end of the line.
                    caption = strippedline[:dollar_amount_match.start(1)].strip()
                    value = strippedline[dollar_amount_match.start(2):].strip()
                    #df = df.append({'file':txt_name, 'caption':caption, 'value':value},ignore_index=True)
                    print (u"{:60s} {:10s}".format(caption, value))
                if len(line.strip()) < 5 and linesleft < 10:
                    blanklines += 1
                linesleft -= 1
                if linesleft == 0:
                    printline = 0
In [150]:
#Some exploration
# Lists, for every text file in ./txt, each line that contains one of the
# known section headings as a substring, together with its 0-based line
# index.
max_distance_below = 25
max_distance_above = 5
# NOTE(review): the match below is case-sensitive, while the extraction
# cells upper-case each line before comparing - confirm this is intended.
context_identifier = [
    u"SOURCES AND USES OF FUNDS",
    u"SOURCES AND USES OF FUNDS*",
    u"ESTIMATED SOURCES AND USES OF FUNDS",
    u"ESTIMATED SOURCES AND USES OF FUNDS*",
    u"SOURCES AND USES OF FUNDS(1)",
    u"ESTIMATED SOURCES AND USES OF FUNDS(1)",
    u"PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS",
]
for txt_name in os.listdir('txt'):
    print ("--------" + txt_name + "--------")
    with codecs.open(os.path.join('txt', txt_name), "r", "utf-8") as f:
        for i, line in enumerate(f):
            #Print Candidates
            # any() stops at the first hit and, unlike the builtin reduce,
            # is available on both Python 2 and Python 3.
            if any(identifier in line for identifier in context_identifier):
                print(i, '-', line)
In [ ]: