In [4]:
# Applies a table filter across all tables extracted from a project and pulls out
# the line items needed to estimate the net underwriter discount and the face value,
# for a specific type of table (standard case: two columns of $-denominated
# key-value pairs)
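# Illustrative shape of such a target table (figures invented for illustration):
#
#   Principal Amount           $10,000,000.00
#   Underwriter's Discount        $(50,000.00)
#   Costs of Issuance             $125,000.00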
from __future__ import print_function
import os
import sys
import glob
import codecs
import json
import string
sys.path.insert(0, os.path.pardir)
from backend import *
from data_query import *
UPLOAD_FOLDER = os.path.join('..', 'static', 'ug')
FILTER_FOLDER = os.path.join('..', 'static', 'filters')
PROJECT = 'muni_bonds_bulk_2'
FILTER = 'funds'
path = os.path.join(UPLOAD_FOLDER, PROJECT, '*.tables.json')
table_files = glob.glob(path)
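# Quick sanity check that the glob actually matched something
print("%i table files found under %s" % (len(table_files), path))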
In [5]:
def clean_string(s):
    """Lower-case ASCII-fold `s`, then strip punctuation, digits and outer whitespace."""
    lc = s.encode('ascii', errors='ignore').lower()
    return lc.translate(None, string.punctuation + '0123456789').strip()
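# For illustration (pure stdlib behaviour):
#   clean_string(u'Principal Amount: $1,000')  ->  'principal amount'
#   clean_string(u'Costs of Issuance (2015)')  ->  'costs of issuance'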
from collections import Counter
table_counter = Counter()
tables_looked_at = 0
confidences = []
no_table_files = []
no_ud_tables = []
no_fv_tables = []
funny_tables = {}
salient_values = {}
# Get those line items sufficient for IRR estimation
# remark: improved query terms from TF analysis and annotation
irr_estimate_dict = {'face_value' : ['Principal Amount', 'Par Amount', 'Face Amount'],
                     'premium' : 'Issue Premium',
                     'discount' : 'Issue Discount',
                     'premium_or_discount' : 'Premium Discount', # matches line items that signify either one, at high confidence on the token level
                     'underwriter_discount' : 'Underwriter Discount',
                     'cost_of_issuance' : 'Costs of Issuance'}
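# Assumed shape of get_key_values(..., raw_cell=True), inferred from the usage below
# (not verified against the backend): each key maps to a row like
# [parsed_dollar_value, raw_cell_dict], or an empty result when nothing matched;
# raw_cell_dict may carry a 'leftover' pair of strings surrounding the number,
# e.g. ('(', ')') for bracketed negative amounts. The bracket check in the next
# cell relies on exactly this layout.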
filter_file = os.path.join(FILTER_FOLDER, FILTER+'.json')
with codecs.open(filter_file, "r", "utf-8", errors="replace") as fh:
    _filter = json.load(fh)
print("Processing with filter %s" % str(_filter))
print("Processing with value dictionary %s" % str(irr_estimate_dict))
In [6]:
# Run the filter over every table in every file, keeping a running-max chain of candidates
for i, f in enumerate(table_files):
    with codecs.open(f, 'r', 'utf-8') as fh:
        tables = json.load(fh)
    tables_looked_at += len(tables)
    filename = os.path.basename(f).replace('.tables.json', '')
    filter_results = []
    for t in filter_tables(tables.values(), _filter):
        # keep only candidates at least as confident as every earlier candidate
        if len(filter_results) == 0 or t[0] >= max(r[0] for r in filter_results):
            filter_results.append(t)
table_counter[len(filter_results)] += 1
    if len(filter_results):
        # Only keep the single best match; confidence ties go to the earliest table
        confidence, table, _, _ = max(sorted(filter_results, key=lambda t: t[1]['begin_line']),
                                      key=lambda t: t[0])
confidences.append(confidence)
if len(table['captions']) != 2 or table['subtypes'][1] != 'dollar':
funny_tables[filename] = table['begin_line']
else:
            values = get_key_values(table, irr_estimate_dict, raw_cell=True)
            # invert the line item if it is bracketed (accounting notation for a negative)
            if values['premium_or_discount']:
                r = values['premium_or_discount'][1]
                if 'leftover' in r and '(' in r['leftover'][0] and ')' in r['leftover'][1]:
                    values['premium_or_discount'][0] = -values['premium_or_discount'][0]
            # strip the raw rows, keeping only the parsed values
            values = {k: (v[0] if v else None) for k, v in values.iteritems()}
key = filename+'#'+str(table['begin_line'])
if not values['face_value']:
no_fv_tables.append(key)
if not values['underwriter_discount']:
no_ud_tables.append(key)
            # values.values() gives no ordering guarantee, so serialise in a
            # fixed key order that the spreadsheet header reproduces below
            salient_values[key] = [values[k] for k in sorted(irr_estimate_dict)]
else:
no_table_files.append(filename)
    if ((i + 1) % 100) == 0:
        print("%i files and %i tables processed... with %i best matches" %
              (i + 1, tables_looked_at, len(confidences)))
results = {'high_confidence_candidates' : table_counter.most_common(),
'tables_looked_at' : tables_looked_at,
'tables_canonical' : len(confidences),
           'confidence_mean' : (float(sum(confidences)) / len(confidences)) if confidences else None,
'confidences' : confidences,
'no_table_files' : no_table_files,
'no_ud_tables' : no_ud_tables,
'no_fv_tables' : no_fv_tables,
'funny_tables' : funny_tables,
'salient_values' : salient_values
}
In [7]:
#Save intermediate results
with codecs.open("IRR_estimate.results.json", "w", "utf-8") as file:
json.dump(results, file)
In [8]:
#Work from intermediate results
with codecs.open("IRR_estimate.results.json", "r", "utf-8") as file:
results = json.load(file)
In [9]:
import xlwt
bold = xlwt.Style.easyxf("font: bold on")
def write_table(sheet, keys, values, row, c_offset=0, column_style=bold):
    """Write a styled header row, then one sheet row per item in `values`;
    return the index of the next free row."""
    for j, k in enumerate(keys):
        sheet.write(row, c_offset + j, k, column_style)
    row += 1
    for v in values:
        for j, vv in enumerate(v):
            sheet.write(row, c_offset + j, vv)
        row += 1
    return row
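# Usage sketch (hypothetical data):
#   next_row = write_table(sheet, ['Name', 'Value'], [['a', 1], ['b', 2]], 0)
# writes a header plus two data rows starting at row 0 and returns 3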
In [10]:
url_prefix = "http://tabularazr.eastus.cloudapp.azure.com:7081/show/"+PROJECT+'/'
In [11]:
def to_xls_url(url, link=None):
    # xlwt formulas use ';' to separate the HYPERLINK arguments
    suffix = '; "' + link + '")' if link else ')'
    return xlwt.Formula('HYPERLINK("' + url + '"' + suffix)
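# e.g. to_xls_url('http://example.com/x', 'x') builds the formula
#      HYPERLINK("http://example.com/x"; "x"), which Excel renders as a clickable link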
In [12]:
wkb = xlwt.Workbook(encoding='utf-8')
s_summary, s_funding_values, s_confidence, s_no_table, s_no_fv_tables, s_no_ud_tables, s_funny_tables = \
(wkb.add_sheet(s) for s in ['summary', 'funding_values', 'confidence', 'no_table',
'no_face_value_tables', 'no_underwriter_discount_tables', 'funny_tables'])
In [13]:
i = 0
s_summary.write(i,0, 'Filter used', bold)
s_summary.write(i,1, str(_filter))
i+=1
s_summary.write(i,0, 'Value extraction dictionary used', bold)
s_summary.write(i,1, str(irr_estimate_dict))
i+=2
s_summary.write(i,0, 'Distribution of good table matches per document', bold)
i+=1
i = write_table(s_summary, ['Nr. of Table Candidates', 'Nr. of Documents'],
results["high_confidence_candidates"], i)
i+=1
s_summary.write(i, 2, 'Total nr. of Table Candidates')
s_summary.write(i, 3, 'out of..')
i+=1
s_summary.write(i, 2, results['tables_canonical'])
s_summary.write(i, 3, results['tables_looked_at'])
i = write_table(s_confidence, ['Confidence in best Table found'], ([c] for c in results['confidences']), 0)
i = write_table(s_no_table, ['Files with no suitable table found', 'URL'],
( ([c], to_xls_url(url_prefix+c)) for c in results['no_table_files'] ), 0)
i = write_table(s_no_ud_tables, ['Tables with no Underwriter Discount found', 'URL'],
( ([c], to_xls_url(url_prefix+c)) for c in results['no_ud_tables'] ), 0)
i = write_table(s_no_fv_tables, ['Tables with no Face Value found', 'URL'],
( ([c], to_xls_url(url_prefix+c)) for c in results['no_fv_tables'] ), 0)
s_funny_tables.write(0, 4, "[as returned by the filter, but with <> 2 columns and/or no $ value in the 2nd column]")
i = write_table(s_funny_tables, ['Funny Tables in File', 'Table ID', 'URL'],
( ( f, t, to_xls_url(url_prefix+f+'#'+str(t)) ) for f, t in results['funny_tables'].iteritems() ), 0)
header_funding_values = ['Filename/Table', 'URL'] + sorted(irr_estimate_dict)
i = write_table(s_funding_values, header_funding_values,
(( [k, to_xls_url(url_prefix+k)] + v) for k, v in results['salient_values'].iteritems()), 0)
In [14]:
wkb.save('IRR_estimate.results.xls')