In [4]:
# Applies a table filter across all extracted tables of a project and calculates
# the net underwriter discount and the face value
# for a specific type of table (standard case: two columns with $-denominated key-value pairs)

from __future__ import print_function

import os
import sys
import glob
import codecs
import json

import string

sys.path.insert(0, os.path.pardir)

from backend import *
from data_query import *

UPLOAD_FOLDER = os.path.join('..', 'static', 'ug')
FILTER_FOLDER = os.path.join('..', 'static', 'filters')
PROJECT = 'muni_bonds_bulk_2'
FILTER = 'funds'

path = os.path.join(UPLOAD_FOLDER, PROJECT, '*.tables.json')
table_files = glob.glob(path)

In [5]:
# Normalize a string: drop non-ASCII characters, lower-case, then strip punctuation and digits
def clean_string(s):
    lc = s.encode('ascii', errors='ignore').lower()
    return lc.translate(None, string.punctuation + '0123456789').strip()
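# Illustration (hypothetical input): clean_string(u'Principal Amount: $1,200,000*')
# lower-cases, strips the punctuation and digits, and returns 'principal amount'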
    
from collections import Counter

table_counter = Counter()
tables_looked_at = 0
confidences = []
no_table_files = []
no_ud_tables = []
no_fv_tables = []
funny_tables = {}

salient_values = {}

# Query terms for the line items needed for IRR estimation
# (terms refined via TF analysis and manual annotation)
irr_estimate_dict = {'face_value' : ['Principal Amount', 'Par Amount', 'Face Amount'], 
                     'premium' : 'Issue Premium',
                     'discount': 'Issue Discount',
                     'premium_or_discount' : 'Premium Discount', # matches line items that signify either one, with high confidence at the token level
                     'underwriter_discount' : 'Underwriter Discount',
                     'cost_of_issuance' : 'Costs of Issuance'}
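
# Note: the actual matching happens inside data_query.get_key_values (imported above; internals
# not shown here). A minimal sketch of the kind of token-level scoring assumed, for illustration only:
def token_match_score(caption, term):
    # fraction of the query term's tokens that appear in the table caption
    caption_tokens = set(caption.lower().split())
    term_tokens = set(term.lower().split())
    return len(caption_tokens & term_tokens) / float(len(term_tokens) or 1)

# e.g. token_match_score('Original Issue Premium', 'Premium Discount') == 0.5, so line items
# signifying either a premium or a discount score above zero against 'Premium Discount'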

filter_file = os.path.join(FILTER_FOLDER, FILTER+'.json')
with codecs.open(filter_file, "r", "utf-8", errors="replace") as file:
    _filter = json.load(file) 

print ("Processing with filter %s" % str(_filter))
print ("Procssing with value dictionary %s" % str(irr_estimate_dict))


Processing with filter {u'headers': {u'threshold': 0.35, u'terms': [u'USES OF FUNDS']}, u'name': u'Estimated use and sources of funds'}
Processing with value dictionary {'underwriter_discount': 'Underwriter Discount', 'premium': 'Issue Premium', 'premium_or_discount': 'Premium Discount', 'discount': 'Issue Discount', 'face_value': ['Principal Amount', 'Par Amount', 'Face Amount'], 'cost_of_issuance': 'Costs of Issuance'}

In [6]:
#Process every table file: filter tables, extract salient values, collect diagnostics
for i,f in enumerate(table_files):

    with codecs.open(f, 'r', 'utf-8') as file:
        tables = json.load(file)
        tables_looked_at += len(tables)
        
        filename = os.path.basename(f).replace('.tables.json', '')
        
        filter_results = []
        for t in filter_tables(tables.values(), _filter):
            # keep only candidates at least as confident as the best seen so far
            if len(filter_results) == 0 or t[0] >= max(r[0] for r in filter_results):
                filter_results.append(t)
        
        table_counter[len(filter_results)] += 1        
        if len(filter_results):

            #Keep only the best match; ties go to the table appearing earliest in the document
            confidence, table, _, _ = max( sorted( filter_results, key = lambda t: t[1]['begin_line'] ), 
                                          key = lambda t: t[0])
            confidences.append(confidence)
            # expect exactly two columns, with $-denominated values in the 2nd
            if len(table['captions']) != 2 or table['subtypes'][1] != 'dollar':
                funny_tables[filename] = table['begin_line']
            
            else:
                values = get_key_values(table, irr_estimate_dict, raw_cell=True)
                #invert the line item if its value is bracketed (accounting notation for negatives)
                if values['premium_or_discount']:
                    r = values['premium_or_discount'][1]
                    if 'leftover' in r and '(' in r['leftover'][0] and ')' in r['leftover'][1]:
                        values['premium_or_discount'][0] = -values['premium_or_discount'][0]
                
                #keep only the numeric value of each match, dropping the raw row
                values = {k : (v[0] if v else None) for k,v in values.iteritems()}
                key = filename+'#'+str(table['begin_line'])
                
                if not values['face_value']: 
                    no_fv_tables.append(key)

                if not values['underwriter_discount']: 
                    no_ud_tables.append(key)
                    
                #fix the column order explicitly so it matches irr_estimate_dict.keys() used for the header below
                salient_values[key] = [values[k] for k in irr_estimate_dict]

        else:
            no_table_files.append(filename)
        
    if ( (i+1) % 100 ) == 0:
        print ("%i files and %i tables processed... with %i best matches" % \
               (i+1, tables_looked_at, len(confidences)))

        
results = {'high_confidence_candidates' : table_counter.most_common(),
           'tables_looked_at' : tables_looked_at,
           'tables_canonical' : len(confidences),
           'confidence_mean' : sum(confidences) / len(confidences),
           'confidences' : confidences, 
           'no_table_files' : no_table_files,
           'no_ud_tables' : no_ud_tables,
           'no_fv_tables' : no_fv_tables,
           'funny_tables' : funny_tables,
           'salient_values' : salient_values
          }
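
The loop above relies only on a few fields of each filter_tables result; a minimal stand-in record, assuming the 4-tuple shape unpacked above (not the actual data_query output format):

# Hypothetical example of the shape the loop body reads
example_result = (0.82,                                   # confidence score
                  {'begin_line': 120,                     # anchor line within the source document
                   'captions': ['Sources of Funds', ''],  # expected: exactly two columns
                   'subtypes': ['', 'dollar']},           # expected: $-denominated 2nd column
                  None, None)                             # remaining elements unused here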

In [7]:
#Save intermediate results
with codecs.open("IRR_estimate.results.json", "w", "utf-8") as file:
    json.dump(results, file)

In [8]:
#Work from intermediate results
with codecs.open("IRR_estimate.results.json", "r", "utf-8") as file:
    results = json.load(file)

In [9]:
import xlwt

bold = xlwt.Style.easyxf("font: bold on")

# Write a styled header row plus data rows to `sheet` starting at `row`; returns the next free row index
def write_table(sheet, keys, values, row, c_offset = 0, column_style = bold):
    for j, k in enumerate(keys):
        sheet.write(row, c_offset+j, k, column_style)
    row += 1
    for v in values:
        for j, vv in enumerate(v):
            sheet.write(row, c_offset+j, vv)
        row +=1
    return row
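
A minimal usage sketch (hypothetical sheet and data): write_table returns the next free row index, which is why the calls in the cells below can be chained through `i`.

# Hypothetical example: one header row plus two data rows -> returns 3
demo_row = write_table(xlwt.Workbook().add_sheet('demo'), ['Key', 'Value'], [['a', 1], ['b', 2]], 0)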

In [10]:
url_prefix = "http://tabularazr.eastus.cloudapp.azure.com:7081/show/"+PROJECT+'/'

In [11]:
# Build an Excel HYPERLINK formula; xlwt accepts ';' as the argument separator
def to_xls_url(url, link = None):
    f = 'HYPERLINK("'+url+'"' + ('; "'+link+'")' if link else ')')
    return xlwt.Formula(f)
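# Usage illustration (hypothetical URL):
# to_xls_url('http://example.com', 'Example')
# -> Formula('HYPERLINK("http://example.com"; "Example")')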

In [12]:
wkb = xlwt.Workbook(encoding='utf-8')
s_summary, s_funding_values, s_confidence, s_no_table, s_no_fv_tables, s_no_ud_tables, s_funny_tables = \
    (wkb.add_sheet(s) for s in ['summary', 'funding_values', 'confidence', 'no_table', 
                                'no_face_value_tables', 'no_underwriter_discount_tables', 'funny_tables'])

In [13]:
i = 0
s_summary.write(i,0, 'Filter used', bold)
s_summary.write(i,1, str(_filter))
i+=1
s_summary.write(i,0, 'Value extraction dictionary used', bold)
s_summary.write(i,1, str(irr_estimate_dict))
i+=2
s_summary.write(i,0, 'Distribution of good table matches per document', bold)
i+=1
i = write_table(s_summary, ['Nr. of Table Candidates', 'Nr. of Documents'], 
                results["high_confidence_candidates"], i)

i+=1
s_summary.write(i, 2, 'Total nr. of Table Candidates')
s_summary.write(i, 3, 'out of..')
i+=1
s_summary.write(i, 2, results['tables_canonical'])
s_summary.write(i, 3, results['tables_looked_at'])

i = write_table(s_confidence, ['Confidence in best Table found'], ([c] for c in results['confidences']), 0)
i = write_table(s_no_table, ['Files with no suitable table found', 'URL'], 
                ( ([c], to_xls_url(url_prefix+c)) for c in results['no_table_files'] ), 0)
i = write_table(s_no_ud_tables, ['Tables with no Underwriter Discount found', 'URL'], 
                ( ([c], to_xls_url(url_prefix+c)) for c in results['no_ud_tables'] ), 0)
i = write_table(s_no_fv_tables, ['Tables with no Face Value found', 'URL'], 
                ( ([c], to_xls_url(url_prefix+c)) for c in results['no_fv_tables'] ), 0)


s_funny_tables.write(0,4, "[as returned by the filter but with <> 2 columns, and/or no $ value in the 2nd column]")
i = write_table(s_funny_tables, ['Funny Tables in File', 'Table ID',  'URL'], 
                ( ( f, t, to_xls_url(url_prefix+f+'#'+str(t)) ) for f, t in results['funny_tables'].iteritems() ), 0)

header_funding_values = ['Filename/Table', 'URL'] + irr_estimate_dict.keys()
i = write_table(s_funding_values, header_funding_values, 
               (( [k, to_xls_url(url_prefix+k)] + v) for k, v in results['salient_values'].iteritems()), 0)

In [14]:
wkb.save('IRR_estimate.results.xls')