In [35]:
from __future__ import print_function

import codecs
import os
import re
import string
import subprocess

#Convert all pdfs
# Runs `pdftotext -layout` on every entry of ./pdf and writes the result to
# ./txt/<name>.txt. Entries whose text file already exists are skipped, so the
# cell can be re-run without repeating finished conversions.
files = os.listdir('pdf')
for i,f in enumerate(files):

    pdf_path = os.path.join('pdf', f)
    txt_path = os.path.join('txt', f+'.txt')

    if not os.path.isfile(txt_path):
        #Layout preservation crucial to maintain clues about tabular data
        # The arguments are passed as a list (no shell involved), so file names
        # containing spaces or shell metacharacters reach pdftotext verbatim.
        args = ["pdftotext", "-layout", pdf_path, txt_path]
        cmd = " ".join(args)  # only used for the progress message
        print ('%d/%d %s' % (i, len(files), cmd))
        try:
            status = subprocess.call(args)
        except OSError as err:
            # Raised when the pdftotext executable cannot be started.
            print ('could not run pdftotext for %s: %s' % (pdf_path, err))
        else:
            if status != 0:
                print ('pdftotext exited with status %d for %s' % (status, pdf_path))
    else:
        print ('skipping %s, already exists.' % (pdf_path, ))


skipping pdf/ER862677-ER674128-ER1075876.pdf, already exists.
skipping pdf/EP849915-EP657701-EP1059361.pdf, already exists.
skipping pdf/ER866175-ER676833-ER1078611.pdf, already exists.

In [147]:
#Existing Version
# Baseline extractor. For every file in ./txt it prints
#   * three "HEADER n" rows: the first line containing seven spaces followed
#     by "$", and the two lines after it;
#   * caption / amount rows found within a fixed window of lines that starts
#     at a recognised sources-and-uses heading.
# In this version an amount is only recognised when it ends in a cents part
# (".d" or ".dd").

# Headings (compared against the upper-cased, stripped line) that open a table.
table_headings = (
    "SOURCES AND USES OF FUNDS",
    "SOURCES AND USES OF FUNDS*",
    "ESTIMATED SOURCES AND USES OF FUNDS",
    "ESTIMATED SOURCES AND USES OF FUNDS*",
    "SOURCES AND USES OF FUNDS(1)",
    "ESTIMATED SOURCES AND USES OF FUNDS(1)",
    "PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS",
)

# Number of lines (the heading line included) scanned after a heading.
table_window = 25

# Compiled once instead of once per line; raw string so the backslashes reach
# the regex engine unchanged. Optional "$", up to six whitespace characters,
# digits/commas, then a mandatory cents part anchored at the end of the line.
dollar_amount_regex = re.compile(r"[\$]{0,1}[\s]{0,6}[0-9,]{0,15}(\.[0-9]{1,2})$")

for txt_name in os.listdir('txt'):

    print ("--------" + txt_name + "--------")

    topfound = False  # True once the first indented "$" line has been seen
    headerline = 0    # 1..3 while the three header lines are being printed
    linesleft = 0     # lines of the current table window still to be scanned

    with codecs.open(os.path.join('txt', txt_name), "r", "utf-8") as f:
        for line in f:

            strippedline = line.upper().strip()

            # `in` also accepts a match at column 0, which the former
            # `string.find(...) > 0` test silently rejected.
            if not topfound and "       $" in line:
                headerline = 1
                topfound = True

            if 1 <= headerline <= 3:
                caption = "HEADER " + str(headerline)
                value = strippedline
                #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)
                print (u"{:60s} {:10s}".format(caption, value))
                headerline = headerline + 1
                # Header lines are not examined for headings or amounts.
                continue

            # A heading (re)starts the scan window.
            if strippedline in table_headings:
                linesleft = table_window

            if linesleft > 0:
                dollar_amount_match = dollar_amount_regex.search(strippedline)
                if dollar_amount_match:
                    # Everything before the amount is the caption.
                    caption = strippedline[:dollar_amount_match.start(0)].strip()
                    value = strippedline[dollar_amount_match.start(0):].strip()
                    #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)
                    print (u"{:60s} {:10s}".format(caption, value))
                linesleft = linesleft - 1


--------ER862677-ER674128-ER1075876.pdf.txt--------
HEADER 1                                                     $95,885,000
HEADER 2                                                     CALIFORNIA MUNICIPAL FINANCE AUTHORITY
HEADER 3                                                     REVENUE BONDS, SERIES 2015-A
--------EP849915-EP657701-EP1059361.pdf.txt--------
HEADER 1                                                     $6,645,000
HEADER 2                                                     CITY OF PALM SPRINGS
HEADER 3                                                     LIMITED OBLIGATION REFUNDING IMPROVEMENT BONDS
--------ER866175-ER676833-ER1078611.pdf.txt--------
HEADER 1                                                     $19,560,000
HEADER 2                                                     RNR SCHOOL FINANCING AUTHORITY
HEADER 3                                                     COMMUNITY FACILITIES DISTRICT NO. 92-1
PRINCIPAL AMOUNT OF 2015 REFUNDING BONDS                     $19,560,000.00
PLUS: NET ORIGINAL ISSUE PREMIUM                             2,550,554.30
PLUS: TRANSFERRED MONEYS FROM FUNDS FOR 2006 BONDS           367,663.99
TOTAL SOURCES                                                $22,302,178.29
DEPOSIT INTO ESCROW FUND (1)                                 $21,893,691.38
DEPOSIT INTO 2015A COSTS OF ISSUANCE ACCOUNT (2)             408,486.91
TOTAL USES                                                   $22,302,178.29

In [ ]:
#Issues with the existing version:
## Doesn't pick up the caption in EP1059361 --> add "USES OF FUNDS" as a heading, but then there is still no match for SOURCES OF PAYMENTS
## Doesn't pick up the line items in ER1075876 --> also match sequences of dots (....) as an indicator of tables, and be more lenient with cents values

In [154]:
#New Version
# Extractor with a more lenient table detection. For every file in ./txt it
# prints
#   * three "HEADER n" rows: the first line containing seven spaces followed
#     by "$", and the two lines after it;
#   * caption / amount rows found within a fixed window of lines that starts
#     at a recognised sources-and-uses heading.
# What this version adds:
#   * "ESTIMATED USES OF FUNDS" is accepted as a heading;
#   * an amount must be preceded by at least four dots or four whitespace
#     characters, i.e. the line has to look tabular;
#   * the cents part of an amount is optional.

# Headings (compared against the upper-cased, stripped line) that open a table.
table_headings = (
    "SOURCES AND USES OF FUNDS",
    "SOURCES AND USES OF FUNDS*",
    "ESTIMATED SOURCES AND USES OF FUNDS",
    "ESTIMATED SOURCES AND USES OF FUNDS*",
    "SOURCES AND USES OF FUNDS(1)",
    "ESTIMATED SOURCES AND USES OF FUNDS(1)",
    "PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS",
    "ESTIMATED USES OF FUNDS", #New
)

# Number of lines (the heading line included) scanned after a heading.
table_window = 25

#Include a minimum of preceding dots or whitespace
#Group 1 = separator: run of >= 4 dots or >= 4 whitespace characters
#Group 2 = Dollar value (optional "$", optional spaces, digits and commas)
#Group 3 = Cents value if existing
# Compiled once instead of once per line. Plain raw strings replace the
# Python-2-only ur"" literals; the resulting pattern text is identical.
dollar_amount_regex = re.compile(
    r"([\.]{4,}|[\s]{4,})[\s]*" +
    r"([\$]{0,1}[\s]{0,6}[0-9,]{2,15})(\.[0-9]{1,2})?$"
)

for txt_name in os.listdir('txt'):

    print ("--------" + txt_name + "--------")

    topfound = False  # True once the first indented "$" line has been seen
    headerline = 0    # 1..3 while the three header lines are being printed
    linesleft = 0     # lines of the current table window still to be scanned

    with codecs.open(os.path.join('txt', txt_name), "r", "utf-8") as f:
        for line in f:

            strippedline = line.upper().strip()

            # `in` also accepts a match at column 0, which the former
            # `string.find(...) > 0` test silently rejected.
            if not topfound and "       $" in line:
                headerline = 1
                topfound = True

            if 1 <= headerline <= 3:
                caption = "HEADER " + str(headerline)
                value = strippedline
                #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)
                print (u"{:60s} {:10s}".format(caption, value))
                headerline = headerline + 1
                # Header lines are not examined for headings or amounts.
                continue

            # A heading (re)starts the scan window.
            if strippedline in table_headings:
                linesleft = table_window
                #print ("#### line:", i, "to", i+linesleft)

            if linesleft > 0:
                dollar_amount_match = dollar_amount_regex.search(strippedline)

                #Check whether we found something tabular and a dollar value
                # (group 2 is mandatory and at least two characters long, so a
                # successful match already implies a dollar value)
                if dollar_amount_match:
                    # Caption ends where the dot/whitespace separator begins;
                    # the value runs from the dollar part to the end of line.
                    caption = strippedline[:dollar_amount_match.start(1)].strip()
                    value = strippedline[dollar_amount_match.start(2):].strip()
                    #df = df.append({'file':file, 'caption':caption, 'value':value},ignore_index=True)
                    print (u"{:60s} {:10s}".format(caption, value))
                linesleft = linesleft - 1


--------ER862677-ER674128-ER1075876.pdf.txt--------
HEADER 1                                                     $95,885,000
HEADER 2                                                     CALIFORNIA MUNICIPAL FINANCE AUTHORITY
HEADER 3                                                     REVENUE BONDS, SERIES 2015-A
PRINCIPAL AMOUNT                                             $ 95,885,000
BOND PREMIUM                                                 12,984,339
OTHER AVAILABLE FUNDS(1)                                     6,600,643 
TOTAL SOURCES                                                $115,469,982
DEPOSIT TO ACQUISITION FUND                                  $ 41,000,000
RETIREMENT OF WATER REVENUE ANTICIPATION NOTES(2)            14,000,000
DEPOSIT TO ESCROW FUND FOR REFUNDED 2008 BONDS               52,742,691
DISCHARGE OF STATE LOAN                                      7,096,550 
COSTS OF ISSUANCE(3)                                         630,741   
TOTAL USES                                                   $115,469,982
--------EP849915-EP657701-EP1059361.pdf.txt--------
HEADER 1                                                     $6,645,000
HEADER 2                                                     CITY OF PALM SPRINGS
HEADER 3                                                     LIMITED OBLIGATION REFUNDING IMPROVEMENT BONDS
TRANSFER TO ESCROW BANK                                      $6,086,693.08
RESERVE FUND (1)                                             274,331.25
COSTS OF ISSUANCE FUND (2)                                   152,404.72
TOTAL USES                                                   $6,513,429.05
--------ER866175-ER676833-ER1078611.pdf.txt--------
HEADER 1                                                     $19,560,000
HEADER 2                                                     RNR SCHOOL FINANCING AUTHORITY
HEADER 3                                                     COMMUNITY FACILITIES DISTRICT NO. 92-1
PRINCIPAL AMOUNT OF 2015 REFUNDING BONDS                     $19,560,000.00
PLUS: NET ORIGINAL ISSUE PREMIUM                             2,550,554.30
PLUS: TRANSFERRED MONEYS FROM FUNDS FOR 2006 BONDS           367,663.99
TOTAL SOURCES                                                $22,302,178.29
DEPOSIT INTO ESCROW FUND (1)                                 $21,893,691.38
DEPOSIT INTO 2015A COSTS OF ISSUANCE ACCOUNT (2)             408,486.91
TOTAL USES                                                   $22,302,178.29

In [150]:
#Some exploration
# Lists every line of every file in ./txt that contains one of the heading
# phrases, together with its 0-based line index, to see where the phrases
# occur. The test is a case-sensitive substring match on the raw line, so
# "*" and "(1)" are literal characters, not wildcards.

# Not read anywhere in this cell.
max_distance_below = 25
max_distance_above = 5

context_identifier = [
    u"SOURCES AND USES OF FUNDS",
    u"SOURCES AND USES OF FUNDS*",
    u"ESTIMATED SOURCES AND USES OF FUNDS",
    u"ESTIMATED SOURCES AND USES OF FUNDS*",
    u"SOURCES AND USES OF FUNDS(1)",
    u"ESTIMATED SOURCES AND USES OF FUNDS(1)",
    u"PLAN OF FINANCE AND ESTIMATED SOURCES AND USES OF FUNDS",
]

for txt_name in os.listdir('txt'):

    print ("--------" + txt_name + "--------")
    with codecs.open(os.path.join('txt', txt_name), "r", "utf-8") as f:
        for i, line in enumerate(f):

            #Print Candidates
            # any() stops at the first phrase that matches and does not depend
            # on the Python-2-only builtin reduce().
            if any(phrase in line for phrase in context_identifier):
                # `line` keeps its trailing newline, hence the blank line
                # after each hit in the output.
                print(i, '-', line)


--------ER862677-ER674128-ER1075876.pdf.txt--------
27 - issuance of the Bonds. See “PLAN OF FINANCE” and “ESTIMATED SOURCES AND USES OF FUNDS.”

229 - ESTIMATED SOURCES AND USES OF FUNDS ................................................................................... 7

370 - of the Bonds. See “PLAN OF FINANCE” and “ESTIMATED SOURCES AND USES OF FUNDS.”

653 -                                  ESTIMATED SOURCES AND USES OF FUNDS

--------EP849915-EP657701-EP1059361.pdf.txt--------
--------ER866175-ER676833-ER1078611.pdf.txt--------
223 - ESTIMATED SOURCES AND USES OF FUNDS .................................................................................. 13 

429 - Bonds. See “ESTIMATED SOURCES AND USES OF FUNDS.”

715 - “ESTIMATED SOURCES AND USES OF FUNDS.”

983 -                              ESTIMATED SOURCES AND USES OF FUNDS


In [ ]: