In [1]:
#TabulaRazr - specific to calculate  - TABLE Parser
#Infers a table with arbitrary number of columns from reoccuring patterns in text lines
#(c) Alexander Hirner 2016, no redistribution without permission
#Contributions: ____ (refactoring), UI styling (), ....


#Main assumptions Table identificatin:
#1) each row is either in one line or not a row at all
#2) each column features at least one number (=dollar amount)
#2a) each column features at least one date-like string [for time-series only]
#3) a table exists if rows are in narrow consecutive order and share similarities --> scoring algo [DONE] 
#4) each column is separated by more than x consecutive whitespace indicators (e.g. '  ' or '..')

#Feature List Todo:
#1) Acknowledge footnotes / make lower meta-data available
#2) make delimiter length smartly dependent on number of columns (possible iterative approach)
#3) improve captioning: expand non canonical values in tables [DONE] .. but not to the extent how types match up  --> use this to further
## delineate between caption and headers
#4) UI: parameterize extraction on the show page on the fly
#5) deeper type inference on token level: type complex [DONE], subtype header (centered, capitalized), 
## subtype page nr., type free flow [DONE, need paragraph]
#5a) re
#6) Respect negative values with potential '-' for numerical values
#7)
#8) classify tables with keywords (Muni Bonds) and unsupervised clustering (Hackathon)
#9) Restructure folder and URI around MD5 hash (http://stackoverflow.com/questions/24570066/calculate-md5-from-werkzeug-datastructures-filestorage-without-saving-the-object)

In [2]:
from backend import *
import os

In [3]:
import json
from flask import Flask, request, redirect, url_for, send_from_directory
from werkzeug import secure_filename
from flask import jsonify, render_template, make_response

import matplotlib.pyplot as plt

In [4]:
TITLE = "TabulaRazr (XIRR for muni_bonds)"

scripts = []
css = [
    "./bower_components/bootstrap/dist/css/bootstrap.min.css",
    "./css/main.css",
    "./css/style.css"
]


UPLOAD_FOLDER = './static/ug'
ALLOWED_EXTENSIONS = set(['txt', 'pdf'])

TITLE = "TabulaRazr"

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

def get_extension(filename):
    return '.' in filename and \
           filename.rsplit('.', 1)[1] 

def allowed_file(filename):
    return get_extension(filename) in ALLOWED_EXTENSIONS

@app.route('/', methods=['GET', 'POST'])
def upload_file():

    if request.method == 'POST':
        
        file = request.files['file']
        project = request.form['project']
        
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            extension = get_extension(file.filename)
            path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename)
            
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], project, filename))
            
            if extension == "pdf":
                txt_path = path+'.txt'
                filename += '.txt'        
                if not os.path.isfile(txt_path):
                    #Layout preservation crucial to preserve clues about tabular data
                    cmd = "pdftotext -enc UTF-8 -layout %s %s " % (path, txt_path)
                    os.system(cmd)            

            return redirect(url_for('analyze', filename=filename, project=project))

    return render_template('index.html',
        title=TITLE ,
        css=css)

@app.route('/analyze/<filename>', methods=['GET', 'POST'])
def analyze(filename):   

    project = request.args.get('project')
    txt_path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename)
    
    if not os.path.isfile(txt_path):
        return {'error' : txt_path+' not found' }
    
    tables = return_tables(txt_path)
    
    #Export tables
    with codecs.open(txt_path + '.tables.json', 'w', "utf-8") as file:
        json.dump(tables, file)

    #Export chart
    lines_per_page = 80
    nr_data_rows = []
    #for t in tables.values():
    #    print t
    for key, t in tables.iteritems():
        e = t['end_line']
        b = t['begin_line']
        for l in range(b, e):
            page = l / lines_per_page
            if len(nr_data_rows) <= page:
                nr_data_rows += ([0]*(page-len(nr_data_rows)+1))
            nr_data_rows[page] += 1
    dr = pd.DataFrame()
    dr['value'] = nr_data_rows
    dr['page'] = range(0, len(dr))
    
    #plot the row density
    chart = filename+".png"
    fig, ax = plt.subplots( nrows=1, ncols=1, figsize=(8,3) )  # create figure & 1 axis
    ax.set_xlabel('page nr.')
    ax.set_ylabel('number of data rows')
    ax.set_title('Distribution of Rows with Data')
    ax.plot(dr['page'], dr['value'], )
    fig.savefig(txt_path + '.png')   # save the figure to file
    plt.close(fig)                      # close the figure

    if request.method == 'POST':
        return json.dumps(tables)
    
    return redirect(url_for('uploaded_file', filename=filename, project=project))
    

@app.route('/show/<filename>')
def uploaded_file(filename):

    project = request.args.get('project')    
    path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename)
    
    tables_path = path + '.tables.json'
    chart_path = path+".png"
    
    if not os.path.isfile(tables_path):
        analyze(path)

    with codecs.open(tables_path) as file:
        tables = json.load(file)   

    #Create HTML
    notices = ['Extraction Results for ' + filename, 'Ordered by lines']    
    dfs = (table_to_df(table).to_html() for table in tables.values())
    headers = []
    for t in tables.values():
        if 'header' in t:
            headers.append(t['header'])
        else:
            headers.append('-')
    meta_data = [{'begin_line' : t['begin_line'], 'end_line' : t['end_line']} for t in tables.values()]

    return render_template('viewer.html',
        title=TITLE + ' - ' + filename,
        base_scripts=scripts, filename=filename, project=project,
        css=css, notices = notices, tables = dfs, headers=headers, meta_data=meta_data, chart=chart_path)

@app.route('/inspector/<filename>')
def inspector(filename):
    extension = 'txt'
    path = os.path.join(app.config['UPLOAD_FOLDER'], extension, filename)
    begin_line = int(request.args.get('data_begin'))
    end_line = int(request.args.get('data_end'))
    margin_top = config["meta_info_lines_above"]
    margin_bottom = margin_top
    
    notices = ['showing data lines from %i to %i with %i meta-lines above and below' % (begin_line, end_line, margin_top)]
    with codecs.open(path, "r", "utf-8") as file:
        lines = [l.encode('utf-8') for l in file][begin_line - margin_top:end_line + margin_bottom]
        top_lines = lines[:margin_top]
        table_lines = lines[margin_top:margin_top+end_line-begin_line]
        bottom_lines = lines[margin_top+end_line-begin_line:]
    
    offset = begin_line-margin_top
    table_id = begin_line
    
    return render_template('inspector.html',
        title=TITLE,
        base_scripts=scripts, css=css, notices = notices, filename=filename, top_lines=top_lines, 
        table_lines=table_lines, bottom_lines=bottom_lines, offset=offset, table_id=begin_line)

In [ ]:
def run_from_ipython():
    try:
        __IPYTHON__
        return True
    except NameError:
        return False

if run_from_ipython():
    app.run(host='0.0.0.0', port = 8080)
else:
    PORT = int(os.getenv('PORT', 8080))
    app.run(debug=True, host='0.0.0.0', port = PORT)


ERROR:__main__:Exception on / [POST]
Traceback (most recent call last):
  File "/Library/Python/2.7/site-packages/flask/app.py", line 1817, in wsgi_app
    response = self.full_dispatch_request()
  File "/Library/Python/2.7/site-packages/flask/app.py", line 1477, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/Library/Python/2.7/site-packages/flask/app.py", line 1381, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "/Library/Python/2.7/site-packages/flask/app.py", line 1475, in full_dispatch_request
    rv = self.dispatch_request()
  File "/Library/Python/2.7/site-packages/flask/app.py", line 1461, in dispatch_request
    return self.view_functions[rule.endpoint](**req.view_args)
  File "<ipython-input-4-004e7bcd31d8>", line 36, in upload_file
    path = os.path.join(app.config['UPLOAD_FOLDER'], project, filename)
NameError: global name 'os' is not defined

In [ ]: