In [1]:

    
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
from sklearn import preprocessing
import subprocess
import matplotlib.pyplot as plt
from itertools import islice
import re



In [2]:

    
%matplotlib inline

The general view of the pipeline looks something like this :-

Feature Generation

Conversion of a pdf page image to components using RLSA. This basically runs from pixel to pixel and checks its surrounding if there is a 1 .i.e. if any pixel is black it will mark this pixel black. This will form block like structures that encapsulates the text region, lines etc. anything that is black.
For each block generate text using pdftotext this is a linux utility that exists. The only problem with this is it does not capture - dash at the moment, my guess is its mostly an option somewhere to be enabled.
Using the text information of each block, we run it through a few rules to cleanse the blocks, cleansing encapsulates the process of block separation at rows and columns level, removal based on the text that is present.

Marking Blocks

Based on the text values mark the number cells or as I like to call them cell_values .i.e. cell containing values of the sheet. This is generally simple to identifiy because of few attributes : -
- Its comma separated numbers
- Its generally followed by numbers or headers.
- It forms a grid like structure, which we later on exploit to get mark other components around it.
Mark other blocks as cell headers and groupings based on these marked cell_values and how these are relatively positioned to the block.

Table Generation

Detect rows and go row by row to generate each row and parsing the text encapsulated in the row.

Pasting functions from the older notebook for ease of use.



In [3]:

    
def get_page_image_from_pdf(pdf, page_num, image_file_name):
    page_layout = pdf.getPage(page_num)['/MediaBox']
    command = "convert -density 300 '%s'[%s] '%s'" % (pdf_file_path,
                                                      page_num,
                                                      image_file_name)
    subprocess.check_output(command, shell=True)
    return cv2.imread(image_file_name, 0)

def rlsa(img, threshold):
    ret,thresh1 = cv2.threshold(img, 0, 1, cv2.THRESH_BINARY_INV)
    img_iter = np.nditer(thresh1, flags=['multi_index'])
    C_vertical, C_horizontal = threshold
    temp_thresh = thresh1.copy()
    while not img_iter.finished:
        x, y = img_iter.multi_index
        x_threshold = x + C_horizontal
        y_threshold = y + C_vertical
        neg_x_threshold = x - C_horizontal
        neg_y_threshold = y - C_vertical
        if (thresh1[x:x_threshold, y:y_threshold].any() 
            or thresh1[x:x_threshold, y:neg_y_threshold].any()
            or thresh1[x:neg_x_threshold, y:y_threshold].any()
            or thresh1[x:neg_x_threshold, y:neg_y_threshold].any()):
            temp_thresh[x, y] = 1
        else:
            temp_thresh[x, y] = 0
        img_iter.iternext()
    return temp_thresh

def plot_page(img):
    plt.figure(figsize=(30,20))
    plt.imshow(img, cmap='gray')



In [4]:

    
def get_label_stats(stats, centroids):
    stats_columns = ["left", "top", "width", "height", "area"]
    label_stats = pd.DataFrame(stats, columns=stats_columns)
    label_stats['centroid_x'], label_stats['centroid_y'] = centroids[:, 0], centroids[:, 1]
    # Ignore the label 0 since it is the background
    label_stats.drop(0, inplace=True)
    return label_stats



In [5]:

    
def get_possible_boundary_lines(label_stats):
    possible_boundary_lines = label_stats[(label_stats.width > 1200) & (label_stats.top > 0)]
    possible_boundary_lines['dist_from_next_line'] = possible_boundary_lines.top.shift(-1) - possible_boundary_lines.top
    possible_boundary_lines['dist_from_prev_line'] = possible_boundary_lines.top - possible_boundary_lines.top.shift(1)
    possible_boundary_lines.replace(pd.np.nan, pd.np.inf, inplace=True)
    return possible_boundary_lines[(possible_boundary_lines.dist_from_next_line > 100) & (possible_boundary_lines.dist_from_prev_line > 100)]

def draw_table_bounding_box(img, label_stats):
    possible_boundary_lines = get_possible_boundary_lines(label_stats).reset_index()
    waste_copy = img.copy()
    tables = []
    if len(possible_boundary_lines) % 2 != 0:
        raise "Uneven Probable Bounding Lines"
    for index, row in enumerate(possible_boundary_lines.iterrows()):
        if index % 2 != 0:
            table_bounds = {}
            top_bound = possible_boundary_lines.iloc[index - 1]
            bottom_bound = possible_boundary_lines.iloc[index]
            cv2.rectangle(waste_copy, (int(top_bound['left']), int(top_bound['top'])), 
                          (int(bottom_bound['right']) + 50, int(bottom_bound['bottom'])), (125,125,0), 5)
            table_bounds['left'], table_bounds['top'] = int(top_bound['left']), int(top_bound['top'])
            table_bounds['right'], table_bounds['bottom'] = int(bottom_bound['right']) + 50, int(bottom_bound['bottom'])
            tables.append(table_bounds)
    plot_page(waste_copy)
    return tables

def get_text(page_num, pdf_file_name, x, y, w, h):
    command = 'pdftotext -enc Windows-1255 -f {0} -l {0} -x {1} -y {2} -W {3} -H {4} "{5}" -'.format(page_num + 1,
                                                                                 int(x), 
                                                                                 int(y),
                                                                                 int(w),
                                                                                 int(h),
                                                                                 pdf_file_name)
    return subprocess.check_output(command, shell=True)

def get_text_data(row, page_num, pdf_file_path, horizontal_ratio, vertical_ratio):
    x = (row['left'] * horizontal_ratio)
    y = (row['top'] * vertical_ratio)
    width = (row['width'] * horizontal_ratio) + 5
    hieght = (row['height'] * vertical_ratio) + 5
    text = get_text(page_num, pdf_file_path, x, y, width, hieght)
    if len(text.strip().replace('-','')) < 1:
        row['text'] = text.strip().replace('-','')
    else:
        row['text'] = text.strip()
    row['text_length'] = len(row['text'])
    row['possible_row_merger'] = '\n' in row['text']
    text_matched = re.findall('[a-zA-Z_]+', row['text'])
    comma_sep_matcher = re.compile('^[1-9].(,[0-9]).*$')
    if comma_sep_matcher.match(row['text'].replace('\n', ' ')):
        row['comma_separated_numbers_present'] = True
    else:
        row['comma_separated_numbers_present'] = False
    if len(text_matched) > 0:
        row['is_text'] = True
    else:
        row['is_text'] = False
        
    try:
        row['number'] = int(row['text'].replace(',',''))
    except:
        row['number'] = None
    return row

def mark_number_cells(row):
    if (row['comma_separated_numbers_present'] and row['is_text'] is False) or (row['text'] == '...'):
        row['label'] = 'cell_values'
    return row


def mark_text_cells(row, labels):
    '''
    Detect cells below, left and right. Based on these cells make guesses of labels.
    '''
    left, right = row['left'], row['right']
    top, bottom = row['top'], row['bottom']
    if row['label'] != 'cell_values':      
        if (labels[(labels.right.between(right - 10, right + 10)) & 
                     (labels.top > top) & (labels.label == 'cell_values')].shape[0] > 1):
            row['label'] = 'cell_header'
        elif (labels[(labels.top.between(top - 10, top + 10)) & 
                     (labels.left > left) & (labels.label == 'cell_values')].shape[0] > 1):
            row['label'] = 'cell_grouping'
        elif (labels[(labels.bottom.between(bottom - 10, bottom + 10)) & 
                     (labels.left > left) & (labels.label == 'cell_values')].shape[0] > 1):
            row['label'] = 'cell_grouping'
    return row
    

def get_possible_horizontal_lines(text_labels):
    possible_line_points = text_labels.bottom.value_counts()
    possible_line_points = possible_line_points[possible_line_points > 1]
    return possible_line_points.index



In [6]:

    
def mark_probable_headers(row, labels):
    left, top = row['left'], row['top']
    right, bottom = row['right'], row['bottom']
    labels = labels[labels.table == row['table']]
    # labels in a radius of 15
    x_pos_axis = labels[(labels.left.between(right - 10, right + 15)) &
                        (labels.label == 'cell_header')]
    x_neg_axis = labels[(labels.right.between(left - 15, left + 10)) &
                        (labels.label == 'cell_header')]
    y_pos_axis = labels[(labels.bottom.between(top - 20, top + 10)) &
                        (labels.label == 'cell_header')]
    y_neg_axis = labels[(labels.top.between(bottom - 10, bottom + 15)) &
                        (labels.label == 'cell_header')]

    if (len(x_pos_axis) + len(x_neg_axis) + len(y_pos_axis) + len(y_neg_axis)) > 0:
        if pd.isnull(row['label']):
            row['label'] = 'cell_header'
    return row

def find_higher_level_groupings(row, cell_grouping_height_range):
    bottom, label = row['bottom'], row['label']
    min_range, max_range = cell_grouping_height_range
    if (bottom > min_range) and (bottom < max_range) and (pd.isnull(label)):
        row['label'] = 'cell_higher_grouping'
    return row



In [7]:

    
def find_rows(labels):
    """
    Figure out the points where rows start and end.
    
    1. Headers would be the starting point. Bottom of the headers
    2. Each cell value will be separated by a row.
    """
    rows = []
    rows.extend(labels[labels['label'] == 'cell_header'].aggregate({'top': min, 'bottom': max}).values.flatten())
    rows.extend(labels[labels['label'] == 'cell_values']['bottom'].unique())
    filtered_rows = []
    for index, row_sep in enumerate(rows):
        if index > 0:
            last_row_sep = rows[index - 1]
            if row_sep > (last_row_sep + 20):
                filtered_rows.append(row_sep)
        else:
            filtered_rows.append(row_sep)
    return filtered_rows


def plot_horizontal_lines(line_ys, img, x_start, x_end):
    waste_copy2 = img.copy()
    for line_y in line_ys:
        line_starting_points = (int(x_start), int(line_y))
        line_stopping_points = (int(x_end), int(line_y))
        cv2.line(waste_copy2, line_starting_points, line_stopping_points, (125,255,0), 3)
    plot_page(waste_copy2)

def plot_vertical_lines(line_xs, img, y_start, y_end):
    waste_copy2 = img.copy()
    for line_x in line_xs:
        line_starting_points = (line_x, y_start)
        line_stopping_points = (line_x, y_end)
        cv2.line(waste_copy2, line_starting_points, line_stopping_points, (125,255,0), 3)
    plot_page(waste_copy2)
    
def find_common_headers(labels):
    unmarked_headers = labels[(labels.label == 'cell_header') & (pd.isnull(labels.header_index))]
    if len(unmarked_headers) > 0:
        header = unmarked_headers[unmarked_headers.top == unmarked_headers.top.min()]
        labels.ix[((labels.label == 'cell_header') & 
                   (labels.left.between(header.left.iloc[0] - header.width.iloc[0], header.right.iloc[0])) &
                   (pd.isnull(labels.header_index))
                  ), 'header_index'] = header.pos.iloc[0]
        return find_common_headers(labels)
    return labels
    
def cleanse_labels(labels):
    """
    We need to clean up labels to transform them into csv's
    
    1. Combine Headers so that they don't end up in new lines.
    2. Split label blocks with 2 values combined
    """
    processed_labels = pd.DataFrame()
    
    for index, row in labels.iterrows():
        splitted_row = []
        if row.possible_row_merger == True and  row.label != 'cell_header':
            for index, value in enumerate(row.text.split('\n')):
                new_row = {}
                for col in row.index:
                    new_row[col] = row[col]
                new_height = row.height // len(row.text.split('\n'))
                new_row['height'] = new_height
                new_row['top'] = row.top + (index * new_height)
                new_row['bottom'] = new_row['top'] + new_height
                new_row['text'] = value
                new_row['possible_row_merger'] = False
                splitted_row.append(new_row)
            processed_labels = processed_labels.append(splitted_row)
        else:
            processed_labels = processed_labels.append(row)
    return combine_horizontal(find_common_headers(processed_labels))


def combine_headers(labels):
    """
    1. Combine Blocks based on their horizontal distance.
    """
    processed_labels = pd.DataFrame()
    skip_pos = []
    for index, row in labels.iterrows():
        if row['pos'] not in skip_pos:
            nearby_header = labels[(labels.left.between(row['left'] - row['width'], row['right'])) &
                                   (labels.pos != row['pos']) &
                                   (labels.label == 'cell_header') &
                                   (labels.table == row['table'])].sort_values('top', ascending=True)
            if len(nearby_header) > 0 and row['label'] == 'cell_header':
                # if mergable create a common label and push the `pos` of
                # the row that is being merged into skip_pos
                row['text'] = row['text'] + ' ' + ' '.join(nearby_header.text.tolist())
                row['text'] = row['text'].replace('\n', ' ')
                row['width'] = row['width'] + nearby_header.width.sum()
                row['height'] = row['height'] + nearby_header.height.sum()
                row['left'] = min(row['left'], nearby_header.left.min())
                row['right'] = row['left'] + row['width']
                row['bottom'] = row['top'] + row['height']
                skip_pos.extend(nearby_header.pos.tolist())
            processed_labels = processed_labels.append(row)
    return processed_labels


def combine_horizontal(labels):
    """
    1. Combine Blocks based on their horizontal distance.
    """
    processed_labels = pd.DataFrame()
    skip_pos = []
    for index, row in labels.iterrows():
        if row['pos'] not in skip_pos:
            nearby_labels = labels[(labels.left.between(row['left'] - 5, row['right'] + 5)) &
                                   (labels.top.between(row['top'] -5, row['top'] + 5)) &
                                   (labels.pos != row['pos'])]
            if len(nearby_labels) > 0:
                # if mergable create a common label and push the `pos` of
                # the row that is being merged into skip_pos
                row['text'] = row['text'] + ' '.join(nearby_labels.text.tolist())
                row['width'] = row['width'] + nearby_labels.width.sum()
                row['right'] = row['left'] + row['width']
                skip_pos.extend(nearby_labels.pos.tolist())
            processed_labels = processed_labels.append(row)
    return processed_labels

        
def mark_titles(labels):
    labels.ix[((labels.is_text == True) & 
               (labels.centroid_x.between(1200, 1300)) &
               (pd.isnull(labels.label))
              ), 'label'] = 'title'
    return labels

def mark_summary(row):
    if row['is_text'] == True:
        summaries = re.findall("^[A-z, a-z]*\sRs. [0-9|,]*$", row['text'])
        if len(summaries) > 0:
            row['label'] = 'cell_summary'
    return row

def filter_out_footer(labels):
    bottom_boundary = labels[labels.label == 'cell_values']['bottom'].max()
    return labels[labels.top < bottom_boundary]

def mark_tables_using_titles(labels):
    titles = label_stats_with_text[label_stats_with_text.label == 'title']
    titles['next_diff'] = titles.top - titles.top.shift(1)
    seperators = titles[titles.next_diff > titles.next_diff.mean()]['top'].tolist()
    begin, end = labels.top.min(), labels.bottom.max()
    seperators = [begin] + seperators + [end]
    labels['table'] = None
    for index, sep in enumerate(seperators):
        if index > 0:
            table_start, table_end = seperators[index - 1], sep
            labels.loc[
                    labels['top'].between(table_start, table_end),
                    'table'
                ] = index
    return labels

def mark_rows(labels):
    for table in labels.table.unique():
        table_rows = labels[labels.table == table]
        rows = find_rows(table_rows)
        combined_rows = sorted(rows + table_rows[pd.isnull(table_rows.label)].bottom.unique().tolist())
        for index, row_start in enumerate(combined_rows[:-1]):
            row_end = combined_rows[index + 1]
            labels.loc[((labels.top.between(row_start, row_end)) & 
                        (labels.table == table)), 'row_index'] = index
    
    return labels



In [10]:

    
# Read PDF and page number
pdf_file_path = 'pdfs/west bengal/2017-18/2017_bp11_Demand Nos.1-5.pdf'
pdf = PdfFileReader(open(pdf_file_path, 'rb'))
page_layout = pdf.getPage(3)['/MediaBox']
if '/Rotate' in pdf.getPage(3) and pdf.getPage(3)['/Rotate'] == 90:
    page_width = float(page_layout[3])
    page_height = float(page_layout[2])
else:
    page_width = float(page_layout[2])
    page_height = float(page_layout[3])
img_page = get_page_image_from_pdf(pdf, 2, 'west_bengal_demand_1_5_page_1.png')
image_height, image_width = img_page.shape
horizontal_ratio = page_width / image_width
vertical_ratio = page_height / image_height
# Convert to components
img_rlsa = rlsa(img_page, (20, 25))
# extract geometrical features
n_comp, labels, stats, centroids = cv2.connectedComponentsWithStats(img_rlsa)
label_stats = get_label_stats(stats, centroids)
label_stats['right'] = label_stats.left + label_stats.width
label_stats['bottom'] = label_stats.top + label_stats.height



In [11]:

    
# extract text based features and some basic cleaning
label_stats_with_text = label_stats.apply(get_text_data, axis=1, args=[2, pdf_file_path, horizontal_ratio, vertical_ratio])



In [12]:

    
# Mark All components as Text and Number
label_stats_with_text = label_stats_with_text[label_stats_with_text.text_length > 0].apply(mark_number_cells, axis=1)
label_stats_with_text = label_stats_with_text.apply(mark_text_cells, axis=1, args=[label_stats_with_text])
# Set some required columns that will be set later on.
label_stats_with_text['header_index'] = None
label_stats_with_text['pos'] = label_stats_with_text.index
# Clean up labels
label_stats_with_text = cleanse_labels(label_stats_with_text)
# Mark summaries, titles
# label_stats_with_text = mark_probable_headers(label_stats_with_text)
label_stats_with_text = mark_titles(label_stats_with_text)
label_stats_with_text = label_stats_with_text.apply(mark_summary, axis=1)
# Filter out excessive information
label_stats_with_text = filter_out_footer(label_stats_with_text)
label_stats_with_text









    



DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated [ipykernel_launcher.py:45]
DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated [ipykernel_launcher.py:131]






    Out[12]:







  
    
      
      area
      bottom
      centroid_x
      centroid_y
      comma_separated_numbers_present
      header_index
      height
      is_text
      label
      left
      number
      pos
      possible_row_merger
      right
      text
      text_length
      top
      width
    
  
  
    
      4
      42076.0
      165.0
      1229.127793
      133.112392
      0.0
      NaN
      62.0
      1.0
      title
      885.0
      NaN
      4.0
      0.0
      1575.0
      REVENUE EXPENDITURE
      19.0
      103.0
      690.0
    
    
      6
      19420.0
      234.0
      1225.853708
      205.540783
      0.0
      NaN
      57.0
      1.0
      title
      1049.0
      NaN
      6.0
      0.0
      1409.0
      DEMAND No. 01
      13.0
      177.0
      360.0
    
    
      8
      34215.0
      312.0
      1225.751308
      277.009908
      0.0
      NaN
      66.0
      1.0
      title
      901.0
      NaN
      8.0
      0.0
      1559.0
      Legislative Assembly Secretariat
      32.0
      246.0
      658.0
    
    
      10
      41069.0
      381.0
      1232.540627
      346.063040
      0.0
      NaN
      66.0
      1.0
      title
      827.0
      NaN
      10.0
      0.0
      1632.0
      A-General Services - (a) Organs of State
      40.0
      315.0
      805.0
    
    
      12
      44366.0
      449.0
      1228.446445
      413.491277
      0.0
      NaN
      66.0
      1.0
      title
      798.0
      NaN
      12.0
      0.0
      1662.0
      Head of Account : 2011 - State Legislatures
      43.0
      383.0
      864.0
    
    
      14
      25409.0
      567.0
      379.711559
      531.492739
      0.0
      NaN
      65.0
      1.0
      cell_summary
      142.0
      NaN
      14.0
      0.0
      604.0
      Voted Rs. 61,50,66,000
      22.0
      502.0
      462.0
    
    
      15
      24421.0
      568.0
      1242.827566
      532.310348
      0.0
      NaN
      66.0
      1.0
      cell_summary
      1016.0
      NaN
      15.0
      0.0
      1462.0
      Charged Rs. 59,10,000
      21.0
      502.0
      446.0
    
    
      16
      24894.0
      567.0
      2098.311762
      531.411304
      0.0
      16.0
      65.0
      1.0
      cell_summary
      1869.0
      NaN
      16.0
      0.0
      2317.0
      Total Rs. 62,09,76,000
      22.0
      502.0
      448.0
    
    
      20
      9843.0
      679.0
      1933.956416
      648.331505
      0.0
      16.0
      59.0
      1.0
      cell_header
      1833.0
      NaN
      20.0
      0.0
      2039.0
      Charged Rs.
      11.0
      620.0
      206.0
    
    
      21
      8152.0
      672.0
      1678.513494
      646.764475
      0.0
      16.0
      51.0
      1.0
      cell_header
      1593.0
      NaN
      21.0
      0.0
      1767.0
      Voted Rs.
      9.0
      621.0
      174.0
    
    
      22
      7692.0
      672.0
      2232.769501
      646.766381
      0.0
      16.0
      51.0
      1.0
      cell_header
      2154.0
      NaN
      22.0
      0.0
      2316.0
      Total Rs.
      9.0
      621.0
      162.0
    
    
      26
      14864.0
      791.0
      775.951023
      760.459634
      0.0
      NaN
      58.0
      1.0
      cell_grouping
      618.0
      NaN
      26.0
      0.0
      938.0
      Gross Expenditure
      17.0
      733.0
      320.0
    
    
      0
      19317.0
      789.0
      1671.421235
      782.727390
      1.0
      NaN
      56.0
      0.0
      cell_values
      1553.0
      NaN
      27.0
      0.0
      1767.0
      61,50,66,000
      22.0
      733.0
      214.0
    
    
      1
      19317.0
      845.0
      1671.421235
      782.727390
      1.0
      NaN
      56.0
      0.0
      cell_values
      1553.0
      NaN
      27.0
      0.0
      1767.0
      -2,61,000
      22.0
      789.0
      214.0
    
    
      28
      8747.0
      790.0
      1957.048588
      759.089859
      1.0
      NaN
      57.0
      0.0
      cell_values
      1873.0
      5910000.0
      28.0
      0.0
      2042.0
      59,10,000
      9.0
      733.0
      169.0
    
    
      0
      19346.0
      789.0
      2221.338261
      782.709604
      1.0
      NaN
      56.0
      0.0
      cell_values
      2103.0
      NaN
      29.0
      0.0
      2317.0
      62,09,76,000
      22.0
      733.0
      214.0
    
    
      1
      19346.0
      845.0
      2221.338261
      782.709604
      1.0
      NaN
      56.0
      0.0
      cell_values
      2103.0
      NaN
      29.0
      0.0
      2317.0
      -2,61,000
      22.0
      789.0
      214.0
    
    
      30
      6448.0
      840.0
      640.050093
      816.051489
      0.0
      NaN
      51.0
      1.0
      cell_grouping
      570.0
      NaN
      30.0
      0.0
      909.0
      Deduct -Recoveries
      8.0
      789.0
      339.0
    
    
      32
      1311.0
      840.0
      2016.483600
      824.467582
      0.0
      NaN
      30.0
      0.0
      cell_values
      1995.0
      NaN
      32.0
      0.0
      2039.0
      ...
      3.0
      810.0
      44.0
    
    
      36
      13501.0
      960.0
      857.588253
      929.038368
      0.0
      NaN
      58.0
      1.0
      cell_grouping
      720.0
      NaN
      36.0
      0.0
      1004.0
      Net Expenditure
      15.0
      902.0
      284.0
    
    
      37
      11192.0
      959.0
      1659.620622
      928.225965
      1.0
      NaN
      57.0
      0.0
      cell_values
      1553.0
      614805000.0
      37.0
      0.0
      1767.0
      61,48,05,000
      12.0
      902.0
      214.0
    
    
      38
      8747.0
      959.0
      1957.048588
      928.089859
      1.0
      NaN
      57.0
      0.0
      cell_values
      1873.0
      5910000.0
      38.0
      0.0
      2042.0
      59,10,000
      9.0
      902.0
      169.0
    
    
      39
      11226.0
      959.0
      2209.406378
      928.139319
      1.0
      NaN
      57.0
      0.0
      cell_values
      2103.0
      620715000.0
      39.0
      0.0
      2317.0
      62,07,15,000
      12.0
      902.0
      214.0
    
    
      43
      37051.0
      1084.0
      1229.191007
      1053.613938
      0.0
      NaN
      59.0
      1.0
      title
      911.0
      NaN
      43.0
      0.0
      1549.0
      REVENUE EXPENDITURE
      19.0
      1025.0
      638.0
    
    
      45
      20972.0
      1140.0
      1230.401821
      1113.837784
      0.0
      NaN
      51.0
      1.0
      title
      1019.0
      NaN
      45.0
      0.0
      1440.0
      ABSTRACT ACCOUNT
      16.0
      1089.0
      421.0
    
    
      49
      6212.0
      1261.0
      1675.519961
      1229.796523
      0.0
      NaN
      59.0
      1.0
      NaN
      1613.0
      NaN
      49.0
      0.0
      1740.0
      Budget
      6.0
      1202.0
      127.0
    
    
      50
      6578.0
      1253.0
      1943.899970
      1228.496047
      0.0
      NaN
      51.0
      1.0
      NaN
      1875.0
      NaN
      50.0
      0.0
      2015.0
      Revised
      7.0
      1202.0
      140.0
    
    
      51
      6254.0
      1261.0
      2225.048129
      1229.807963
      0.0
      NaN
      59.0
      1.0
      NaN
      2162.0
      NaN
      51.0
      0.0
      2290.0
      Budget
      6.0
      1202.0
      128.0
    
    
      52
      15406.0
      1365.0
      1401.864858
      1316.907763
      0.0
      52.0
      107.0
      1.0
      cell_header
      1313.0
      NaN
      52.0
      1.0
      1492.0
      Actuals,\n2015-2016
      18.0
      1258.0
      179.0
    
    
      53
      16239.0
      1365.0
      1675.118111
      1314.721535
      0.0
      16.0
      107.0
      1.0
      cell_header
      1587.0
      NaN
      53.0
      1.0
      1766.0
      Estimate,\n2016-2017
      19.0
      1258.0
      179.0
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      103
      15930.0
      2290.0
      1042.681293
      2265.474451
      0.0
      NaN
      51.0
      1.0
      cell_grouping
      874.0
      NaN
      103.0
      0.0
      1218.0
      Grand Total - Gross
      19.0
      2239.0
      344.0
    
    
      104
      11140.0
      2296.0
      1384.866876
      2265.031688
      1.0
      NaN
      57.0
      0.0
      cell_values
      1277.0
      412064235.0
      104.0
      0.0
      1492.0
      41,20,64,235
      12.0
      2239.0
      215.0
    
    
      105
      11232.0
      2296.0
      1659.334046
      2265.156428
      1.0
      NaN
      57.0
      0.0
      cell_values
      1553.0
      608951000.0
      105.0
      0.0
      1767.0
      60,89,51,000
      12.0
      2239.0
      214.0
    
    
      106
      11284.0
      2296.0
      1933.903314
      2265.162708
      1.0
      NaN
      57.0
      0.0
      cell_values
      1827.0
      563709000.0
      106.0
      0.0
      2042.0
      56,37,09,000
      12.0
      2239.0
      215.0
    
    
      107
      11232.0
      2296.0
      2209.386485
      2265.143162
      1.0
      NaN
      57.0
      0.0
      cell_values
      2103.0
      620976000.0
      107.0
      0.0
      2317.0
      62,09,76,000
      12.0
      2239.0
      214.0
    
    
      111
      5085.0
      2403.0
      1163.480826
      2377.670403
      0.0
      NaN
      51.0
      1.0
      cell_grouping
      1108.0
      NaN
      111.0
      0.0
      1218.0
      Voted
      5.0
      2352.0
      110.0
    
    
      0
      19416.0
      2407.0
      1395.550783
      2402.243871
      1.0
      NaN
      55.0
      0.0
      cell_values
      1277.0
      NaN
      112.0
      0.0
      1493.0
      41,04,44,905
      22.0
      2352.0
      216.0
    
    
      1
      19416.0
      2462.0
      1395.550783
      2402.243871
      1.0
      NaN
      55.0
      0.0
      cell_values
      1277.0
      NaN
      112.0
      0.0
      1493.0
      16,19,330
      22.0
      2407.0
      216.0
    
    
      0
      19639.0
      2407.0
      1669.992158
      2402.199145
      1.0
      NaN
      55.0
      0.0
      cell_values
      1553.0
      NaN
      113.0
      0.0
      1768.0
      60,34,24,000
      22.0
      2352.0
      215.0
    
    
      1
      19639.0
      2462.0
      1669.992158
      2402.199145
      1.0
      NaN
      55.0
      0.0
      cell_values
      1553.0
      NaN
      113.0
      0.0
      1768.0
      55,27,000
      22.0
      2407.0
      215.0
    
    
      0
      19683.0
      2407.0
      1944.925926
      2402.218107
      1.0
      NaN
      55.0
      0.0
      cell_values
      1828.0
      NaN
      114.0
      0.0
      2043.0
      55,81,28,000
      22.0
      2352.0
      215.0
    
    
      1
      19683.0
      2462.0
      1944.925926
      2402.218107
      1.0
      NaN
      55.0
      0.0
      cell_values
      1828.0
      NaN
      114.0
      0.0
      2043.0
      55,81,000
      22.0
      2407.0
      215.0
    
    
      0
      19667.0
      2407.0
      2220.060558
      2402.283216
      1.0
      NaN
      55.0
      0.0
      cell_values
      2103.0
      NaN
      115.0
      0.0
      2318.0
      61,50,66,000
      22.0
      2352.0
      215.0
    
    
      1
      19667.0
      2462.0
      2220.060558
      2402.283216
      1.0
      NaN
      55.0
      0.0
      cell_values
      2103.0
      NaN
      115.0
      0.0
      2318.0
      59,10,000
      22.0
      2407.0
      215.0
    
    
      116
      7042.0
      2466.0
      1143.765124
      2435.896904
      0.0
      NaN
      58.0
      1.0
      cell_grouping
      1070.0
      NaN
      116.0
      0.0
      1219.0
      Charged
      7.0
      2408.0
      149.0
    
    
      120
      11604.0
      2572.0
      1090.725009
      2546.832644
      0.0
      NaN
      51.0
      1.0
      cell_grouping
      969.0
      NaN
      120.0
      0.0
      1218.0
      NP - Non Plan
      13.0
      2521.0
      249.0
    
    
      121
      11140.0
      2578.0
      1384.866876
      2547.031688
      1.0
      NaN
      57.0
      0.0
      cell_values
      1277.0
      412064235.0
      121.0
      0.0
      1492.0
      41,20,64,235
      12.0
      2521.0
      215.0
    
    
      122
      11232.0
      2578.0
      1659.334046
      2547.156428
      1.0
      NaN
      57.0
      0.0
      cell_values
      1553.0
      608951000.0
      122.0
      0.0
      1767.0
      60,89,51,000
      12.0
      2521.0
      214.0
    
    
      123
      11284.0
      2578.0
      1933.903314
      2547.162708
      1.0
      NaN
      57.0
      0.0
      cell_values
      1827.0
      563709000.0
      123.0
      0.0
      2042.0
      56,37,09,000
      12.0
      2521.0
      215.0
    
    
      124
      11232.0
      2578.0
      2209.386485
      2547.143162
      1.0
      NaN
      57.0
      0.0
      cell_values
      2103.0
      620976000.0
      124.0
      0.0
      2317.0
      62,09,76,000
      12.0
      2521.0
      214.0
    
    
      128
      5085.0
      2684.0
      1163.480826
      2658.670403
      0.0
      NaN
      51.0
      1.0
      cell_grouping
      1108.0
      NaN
      128.0
      0.0
      1218.0
      Voted
      5.0
      2633.0
      110.0
    
    
      0
      19416.0
      2688.0
      1395.550783
      2683.243871
      1.0
      NaN
      55.0
      0.0
      cell_values
      1277.0
      NaN
      129.0
      0.0
      1493.0
      41,04,44,905
      22.0
      2633.0
      216.0
    
    
      1
      19416.0
      2743.0
      1395.550783
      2683.243871
      1.0
      NaN
      55.0
      0.0
      cell_values
      1277.0
      NaN
      129.0
      0.0
      1493.0
      16,19,330
      22.0
      2688.0
      216.0
    
    
      0
      19639.0
      2688.0
      1669.992158
      2683.199145
      1.0
      NaN
      55.0
      0.0
      cell_values
      1553.0
      NaN
      130.0
      0.0
      1768.0
      60,34,24,000
      22.0
      2633.0
      215.0
    
    
      1
      19639.0
      2743.0
      1669.992158
      2683.199145
      1.0
      NaN
      55.0
      0.0
      cell_values
      1553.0
      NaN
      130.0
      0.0
      1768.0
      55,27,000
      22.0
      2688.0
      215.0
    
    
      0
      19683.0
      2688.0
      1944.925926
      2683.218107
      1.0
      NaN
      55.0
      0.0
      cell_values
      1828.0
      NaN
      131.0
      0.0
      2043.0
      55,81,28,000
      22.0
      2633.0
      215.0
    
    
      1
      19683.0
      2743.0
      1944.925926
      2683.218107
      1.0
      NaN
      55.0
      0.0
      cell_values
      1828.0
      NaN
      131.0
      0.0
      2043.0
      55,81,000
      22.0
      2688.0
      215.0
    
    
      0
      19667.0
      2688.0
      2220.060558
      2683.283216
      1.0
      NaN
      55.0
      0.0
      cell_values
      2103.0
      NaN
      132.0
      0.0
      2318.0
      61,50,66,000
      22.0
      2633.0
      215.0
    
    
      1
      19667.0
      2743.0
      2220.060558
      2683.283216
      1.0
      NaN
      55.0
      0.0
      cell_values
      2103.0
      NaN
      132.0
      0.0
      2318.0
      59,10,000
      22.0
      2688.0
      215.0
    
    
      133
      7042.0
      2747.0
      1143.765124
      2716.896904
      0.0
      NaN
      58.0
      1.0
      cell_grouping
      1070.0
      NaN
      133.0
      0.0
      1219.0
      Charged
      7.0
      2689.0
      149.0
    
  

101 rows × 18 columns



In [13]:

    
# Separate out Tables and find rows and columns
label_stats_with_text = mark_tables_using_titles(label_stats_with_text)









    



SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy [ipykernel_launcher.py:147]



In [14]:

    
label_stats_with_text = combine_headers(label_stats_with_text.apply(mark_probable_headers, args=[label_stats_with_text], axis=1))



In [15]:

    
# Separate out Tables and find rows.
rows = []
for table_no in label_stats_with_text.table.unique().tolist():
    temp_rows = find_rows(label_stats_with_text[label_stats_with_text.table == table_no])
    print(temp_rows)
    rows.extend(temp_rows)









    



[620.0, 679.0, 789.0, 845.0, 840.0, 959.0]
[1202.0, 1418.0, 1651.0, 1706.0, 1821.0, 2001.0, 2056.0, 2171.0, 2296.0, 2407.0, 2462.0, 2578.0, 2688.0, 2743.0]



In [16]:

    
plot_horizontal_lines(rows[1:], img_page, label_stats_with_text.left.min(), label_stats_with_text.right.max())

Before start processing each row we need to handle some of the components/labels that are still marked as NaN. At the moment I am just giving each one of them a separate row.



In [17]:

    
combined_rows = rows + label_stats_with_text[pd.isnull(label_stats_with_text.label)].bottom.unique().tolist()
plot_horizontal_lines(combined_rows, img_page, label_stats_with_text.left.min(), label_stats_with_text.right.max())



In [18]:

    
mark_rows(label_stats_with_text).groupby(['table', 'row_index']).apply(lambda x: len(sorted(x['left'].unique())))









    Out[18]:





table  row_index
1.0    0.0          3
       1.0          4
       2.0          4
       4.0          4
2.0    0.0          4
       1.0          1
       2.0          1
       3.0          6
       5.0          5
       6.0          5
       7.0          1
       8.0          6
       10.0         5
       11.0         5
       12.0         5
       13.0         5
       14.0         5
       15.0         5
       16.0         5
       17.0         5
dtype: int64



In [19]:

    
mark_rows(label_stats_with_text).groupby(['table', 'row_index']).apply(lambda x: len(sorted(x['left'].unique()))).idxmax()









    Out[19]:





(2.0, 3.0)



In [20]:

    
mark_rows(label_stats_with_text).groupby(['table', 'row_index']).apply(lambda x: sorted(x['right'].unique()))









    Out[20]:





table  row_index
1.0    0.0                                 [1767.0, 2039.0, 2316.0]
       1.0                          [938.0, 1767.0, 2042.0, 2317.0]
       2.0                          [909.0, 1767.0, 2039.0, 2317.0]
       4.0                         [1004.0, 1767.0, 2042.0, 2317.0]
2.0    0.0                         [1557.0, 1958.0, 2247.0, 2510.0]
       1.0                                                  [660.0]
       2.0                                                  [577.0]
       3.0          [577.0, 1217.0, 1494.0, 1768.0, 2043.0, 2318.0]
       5.0                 [1219.0, 1494.0, 1768.0, 2043.0, 2318.0]
       6.0                 [1216.0, 1492.0, 1767.0, 2042.0, 2317.0]
       7.0                                                  [601.0]
       8.0          [601.0, 1217.0, 1492.0, 1768.0, 2043.0, 2318.0]
       10.0                [1219.0, 1492.0, 1768.0, 2043.0, 2318.0]
       11.0                [1217.0, 1492.0, 1767.0, 2042.0, 2317.0]
       12.0                [1218.0, 1492.0, 1767.0, 2042.0, 2317.0]
       13.0                [1218.0, 1493.0, 1768.0, 2043.0, 2318.0]
       14.0                [1219.0, 1493.0, 1768.0, 2043.0, 2318.0]
       15.0                [1218.0, 1492.0, 1767.0, 2042.0, 2317.0]
       16.0                [1218.0, 1493.0, 1768.0, 2043.0, 2318.0]
       17.0                [1219.0, 1493.0, 1768.0, 2043.0, 2318.0]
dtype: object



In [21]:

    
def filter_possible_cols(cols, img, table_start, table_end):
    columns_df = pd.DataFrame({'cols': cols})
    columns_df['dark_pixel_count'] = columns_df.cols.apply(lambda x: (img[table_start:table_end, int(x)] < 255).sum())
    columns_df['next_diff'] = columns_df.cols.shift(-1) - columns_df.cols
    filtered_columns_df = columns_df[columns_df['dark_pixel_count'] < 50]
    return filtered_columns_df

def find_columns(labels_with_rows):
    for table in labels_with_rows.table.unique():
        table_rows = labels_with_rows[labels_with_rows.table == table]
        row_with_max_cols = table_rows.groupby(['row_index']).apply(lambda x: len(sorted(x['right'].unique()))).idxmax()
        possible_cols = table_rows.groupby(['row_index']).apply(lambda x: sorted(x['right'].unique())).loc[row_with_max_cols]
        table_start, table_end = table_rows.agg({'top':min, 'bottom': max}).values.flatten()
        cols = filter_possible_cols(possible_cols, img_page, int(table_start), int(table_end))
        table_start, table_end = table_rows.left.min(), table_rows.right.max()
        table_cols = [table_start] + sorted(possible_cols) + [table_end]
        print(table_cols)
        for index, col_start in enumerate(table_cols[:-1]):
            col_end = table_cols[index + 1]
            labels_with_rows.loc[(labels_with_rows.left.between(col_start, col_end) & 
                                 (labels_with_rows.table == table)), 'col_index'] = index
        
    return labels_with_rows


def mark_rows_and_cols(labels):
    labels_with_rows = mark_rows(label_stats_with_text)
    return find_columns(labels_with_rows)



In [22]:

    
label_stats_with_row_col = mark_rows_and_cols(label_stats_with_text)









    



[142.0, 938.0, 1767.0, 2042.0, 2317.0, 2317.0]
[143.0, 577.0, 1217.0, 1494.0, 1768.0, 2043.0, 2318.0, 2510.0]



In [23]:

    
table_one = label_stats_with_row_col[(label_stats_with_row_col.table == 1) &
                                     (pd.notnull(label_stats_with_row_col.row_index))]



In [24]:

    
for _, group in table_one.sort_values('top').groupby('row_index'):
    print(group.sort_values('left').text.values)
    print(group.sort_values('left').col_index.values)









    



['Voted Rs.' 'Charged Rs.' 'Total Rs.']
[ 1.  2.  3.]
['Gross Expenditure' '61,50,66,000' '59,10,000' '62,09,76,000']
[ 0.  1.  2.  3.]
['Deduct -Recoveries' '-2,61,000' '...' '-2,61,000']
[ 0.  1.  2.  3.]
['Net Expenditure' '61,48,05,000' '59,10,000' '62,07,15,000']
[ 0.  1.  2.  3.]



In [25]:

    
table_two = label_stats_with_row_col[(label_stats_with_row_col.table == 2) &
                                     (pd.notnull(label_stats_with_row_col.row_index))]
for _, group in table_two.sort_values('top').groupby('row_index'):
    print(group.sort_values('left').text.values)
    print(group.sort_values('left').col_index.values)









    



['Actuals, 2015-2016 Rs.' 'Budget Estimate, 2016-2017 Rs.'
 'Revised Estimate, 2016-2017 Rs.' 'Budget Estimate, 2017-2018 Rs.']
[ 2.  3.  4.  5.]
['02 -STATE LEGISLATURES']
[ 0.]
['101- Legislative Assembly']
[ 0.]
['NP-Non Plan' 'Voted' '17,20,13,766' '26,26,46,000' '27,39,50,000'
 '28,79,98,000']
[ 0.  1.  2.  3.  4.  5.]
['Charged' '11,26,566' '39,97,000' '40,51,000' '42,79,000']
[ 1.  2.  3.  4.  5.]
['Total - 101' '17,31,40,332' '26,66,43,000' '27,80,01,000' '29,22,77,000']
[ 1.  2.  3.  4.  5.]
['103- Legislative Secretariat']
[ 0.]
['NP-Non Plan' 'Voted' '23,84,31,139' '34,07,78,000' '28,41,78,000'
 '32,70,68,000']
[ 0.  1.  2.  3.  4.  5.]
['Charged' '4,92,764' '15,30,000' '15,30,000' '16,31,000']
[ 1.  2.  3.  4.  5.]
['Total - 103' '23,89,23,903' '34,23,08,000' '28,57,08,000' '32,86,99,000']
[ 1.  2.  3.  4.  5.]
['Grand Total - Gross' '41,20,64,235' '60,89,51,000' '56,37,09,000'
 '62,09,76,000']
[ 1.  2.  3.  4.  5.]
['Voted' '41,04,44,905' '60,34,24,000' '55,81,28,000' '61,50,66,000']
[ 1.  2.  3.  4.  5.]
['Charged' '16,19,330' '55,27,000' '55,81,000' '59,10,000']
[ 1.  2.  3.  4.  5.]
['NP - Non Plan' '41,20,64,235' '60,89,51,000' '56,37,09,000'
 '62,09,76,000']
[ 1.  2.  3.  4.  5.]
['Voted' '41,04,44,905' '60,34,24,000' '55,81,28,000' '61,50,66,000']
[ 1.  2.  3.  4.  5.]
['Charged' '16,19,330' '55,27,000' '55,81,000' '59,10,000']
[ 1.  2.  3.  4.  5.]



In [26]:

    
plot_vertical_lines([1010, 1770, 2050, 2320], img_page, 500, 1000)



In [27]:

    
plot_vertical_lines([600, 1217, 1549, 1768, 2043, 2318], img_page, 1200, 2900)

Convert to csv



In [28]:

    
for table_no in label_stats_with_row_col.table.unique():
    table_labels = label_stats_with_row_col[label_stats_with_row_col.table == table_no]
    title = '_'.join(table_labels[table_labels.label == 'title']['text'].tolist()).replace(' ', '_')
    filename = title + '.csv'
    max_col = int(table_labels.col_index.max())
    with open(filename, 'w') as csv_file:
        # for each row write a line
        for _, group in table_labels.sort_values('top').groupby('row_index'):
            row = ''
            for column_index in range(max_col):
                value = group[group.col_index == column_index]
                if len(value.index) == 0:
                    row += ' ;'
                else:
                    row += value.text.iloc[0] + ';'
            csv_file.write(row)
            csv_file.write('\n')
            print(row)









    



 ;Voted Rs.;Charged Rs.;
Gross Expenditure;61,50,66,000;59,10,000;
Deduct -Recoveries;-2,61,000;...;
Net Expenditure;61,48,05,000;59,10,000;
 ; ;Actuals, 2015-2016 Rs.;Budget Estimate, 2016-2017 Rs.;Revised Estimate, 2016-2017 Rs.;
02 -STATE LEGISLATURES; ; ; ; ;
101- Legislative Assembly; ; ; ; ;
NP-Non Plan;Voted;17,20,13,766;26,26,46,000;27,39,50,000;
 ;Charged;11,26,566;39,97,000;40,51,000;
 ;Total - 101;17,31,40,332;26,66,43,000;27,80,01,000;
103- Legislative Secretariat; ; ; ; ;
NP-Non Plan;Voted;23,84,31,139;34,07,78,000;28,41,78,000;
 ;Charged;4,92,764;15,30,000;15,30,000;
 ;Total - 103;23,89,23,903;34,23,08,000;28,57,08,000;
 ;Grand Total - Gross;41,20,64,235;60,89,51,000;56,37,09,000;
 ;Voted;41,04,44,905;60,34,24,000;55,81,28,000;
 ;Charged;16,19,330;55,27,000;55,81,000;
 ;NP - Non Plan;41,20,64,235;60,89,51,000;56,37,09,000;
 ;Voted;41,04,44,905;60,34,24,000;55,81,28,000;
 ;Charged;16,19,330;55,27,000;55,81,000;



In [33]:

    
!ls









    



'Block detection Sample generation.ipynb'
 cbga_parsers
'Command generation.ipynb'
 current_method_table_boundary.png
'horizontal boundaries blob detection.png'
 karnataka
 man-bash.pdf
 measure_sample.csv
 odisha
 page4_tags.csv
 parsers
 pdf_explorer.py
 pdfs
'PDF to CSV Block Method Pipeline.ipynb'
'PDF To CSV Pipeline - Combining the blocks.ipynb'
 perf_monit.log
 python2_cbga_parsers
 REVENUE_EXPENDITURE_ABSTRACT_ACCOUNT.csv
'REVENUE_EXPENDITURE_DEMAND_No._01_Legislative_Assembly_Secretariat_A-General_Services_-_(a)_Organs_of_State_Head_of_Account_:_2011_-_State_Legislatures.csv'
'REVENUE_EXPENDITURE_DEMAND_No._01_Legislative_Assembly_Secretariat_AGeneral_Services__(a)_Organs_of_State_Head_of_Account_:_2011__State_Legislatures.csv'
 sample.csv
 sample.png
 sample_text_labels.csv
 sample_workflow.png
 test.txt
 wb_parsed_files.csv
 wb_sample
 west_bengal_demand_1_5_page_1.png
 west_bengal_parsed_pdf



In [36]:

    
pd.read_csv('REVENUE_EXPENDITURE_ABSTRACT_ACCOUNT.csv')









    



---------------------------------------------------------------------------
ParserError                               Traceback (most recent call last)
<ipython-input-36-6b0ffb2822b9> in <module>()
----> 1 pd.read_csv('REVENUE_EXPENDITURE_ABSTRACT_ACCOUNT.csv')

/home/jayant/projects/cbga/cbga_parsers/lib/python2.7/site-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    653                     skip_blank_lines=skip_blank_lines)
    654 
--> 655         return _read(filepath_or_buffer, kwds)
    656 
    657     parser_f.__name__ = name

/home/jayant/projects/cbga/cbga_parsers/lib/python2.7/site-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
    409 
    410     try:
--> 411         data = parser.read(nrows)
    412     finally:
    413         parser.close()

/home/jayant/projects/cbga/cbga_parsers/lib/python2.7/site-packages/pandas/io/parsers.pyc in read(self, nrows)
   1003                 raise ValueError('skipfooter not supported for iteration')
   1004 
-> 1005         ret = self._engine.read(nrows)
   1006 
   1007         if self.options.get('as_recarray'):

/home/jayant/projects/cbga/cbga_parsers/lib/python2.7/site-packages/pandas/io/parsers.pyc in read(self, nrows)
   1746     def read(self, nrows=None):
   1747         try:
-> 1748             data = self._reader.read(nrows)
   1749         except StopIteration:
   1750             if self._first_chunk:

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.read (pandas/_libs/parsers.c:10862)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._read_low_memory (pandas/_libs/parsers.c:11138)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._read_rows (pandas/_libs/parsers.c:11884)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._tokenize_rows (pandas/_libs/parsers.c:11755)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.raise_parser_error (pandas/_libs/parsers.c:28765)()

ParserError: Error tokenizing data. C error: Expected 4 fields in line 4, saw 10



In [ ]:

	area	bottom	centroid_x	centroid_y	comma_separated_numbers_present	header_index	height	is_text	label	left	number	pos	possible_row_merger	right	text	text_length	top	width
4	42076.0	165.0	1229.127793	133.112392	0.0	NaN	62.0	1.0	title	885.0	NaN	4.0	0.0	1575.0	REVENUE EXPENDITURE	19.0	103.0	690.0
6	19420.0	234.0	1225.853708	205.540783	0.0	NaN	57.0	1.0	title	1049.0	NaN	6.0	0.0	1409.0	DEMAND No. 01	13.0	177.0	360.0
8	34215.0	312.0	1225.751308	277.009908	0.0	NaN	66.0	1.0	title	901.0	NaN	8.0	0.0	1559.0	Legislative Assembly Secretariat	32.0	246.0	658.0
10	41069.0	381.0	1232.540627	346.063040	0.0	NaN	66.0	1.0	title	827.0	NaN	10.0	0.0	1632.0	A-General Services - (a) Organs of State	40.0	315.0	805.0
12	44366.0	449.0	1228.446445	413.491277	0.0	NaN	66.0	1.0	title	798.0	NaN	12.0	0.0	1662.0	Head of Account : 2011 - State Legislatures	43.0	383.0	864.0
14	25409.0	567.0	379.711559	531.492739	0.0	NaN	65.0	1.0	cell_summary	142.0	NaN	14.0	0.0	604.0	Voted Rs. 61,50,66,000	22.0	502.0	462.0
15	24421.0	568.0	1242.827566	532.310348	0.0	NaN	66.0	1.0	cell_summary	1016.0	NaN	15.0	0.0	1462.0	Charged Rs. 59,10,000	21.0	502.0	446.0
16	24894.0	567.0	2098.311762	531.411304	0.0	16.0	65.0	1.0	cell_summary	1869.0	NaN	16.0	0.0	2317.0	Total Rs. 62,09,76,000	22.0	502.0	448.0
20	9843.0	679.0	1933.956416	648.331505	0.0	16.0	59.0	1.0	cell_header	1833.0	NaN	20.0	0.0	2039.0	Charged Rs.	11.0	620.0	206.0
21	8152.0	672.0	1678.513494	646.764475	0.0	16.0	51.0	1.0	cell_header	1593.0	NaN	21.0	0.0	1767.0	Voted Rs.	9.0	621.0	174.0
22	7692.0	672.0	2232.769501	646.766381	0.0	16.0	51.0	1.0	cell_header	2154.0	NaN	22.0	0.0	2316.0	Total Rs.	9.0	621.0	162.0
26	14864.0	791.0	775.951023	760.459634	0.0	NaN	58.0	1.0	cell_grouping	618.0	NaN	26.0	0.0	938.0	Gross Expenditure	17.0	733.0	320.0
0	19317.0	789.0	1671.421235	782.727390	1.0	NaN	56.0	0.0	cell_values	1553.0	NaN	27.0	0.0	1767.0	61,50,66,000	22.0	733.0	214.0
1	19317.0	845.0	1671.421235	782.727390	1.0	NaN	56.0	0.0	cell_values	1553.0	NaN	27.0	0.0	1767.0	-2,61,000	22.0	789.0	214.0
28	8747.0	790.0	1957.048588	759.089859	1.0	NaN	57.0	0.0	cell_values	1873.0	5910000.0	28.0	0.0	2042.0	59,10,000	9.0	733.0	169.0
0	19346.0	789.0	2221.338261	782.709604	1.0	NaN	56.0	0.0	cell_values	2103.0	NaN	29.0	0.0	2317.0	62,09,76,000	22.0	733.0	214.0
1	19346.0	845.0	2221.338261	782.709604	1.0	NaN	56.0	0.0	cell_values	2103.0	NaN	29.0	0.0	2317.0	-2,61,000	22.0	789.0	214.0
30	6448.0	840.0	640.050093	816.051489	0.0	NaN	51.0	1.0	cell_grouping	570.0	NaN	30.0	0.0	909.0	Deduct -Recoveries	8.0	789.0	339.0
32	1311.0	840.0	2016.483600	824.467582	0.0	NaN	30.0	0.0	cell_values	1995.0	NaN	32.0	0.0	2039.0	...	3.0	810.0	44.0
36	13501.0	960.0	857.588253	929.038368	0.0	NaN	58.0	1.0	cell_grouping	720.0	NaN	36.0	0.0	1004.0	Net Expenditure	15.0	902.0	284.0
37	11192.0	959.0	1659.620622	928.225965	1.0	NaN	57.0	0.0	cell_values	1553.0	614805000.0	37.0	0.0	1767.0	61,48,05,000	12.0	902.0	214.0
38	8747.0	959.0	1957.048588	928.089859	1.0	NaN	57.0	0.0	cell_values	1873.0	5910000.0	38.0	0.0	2042.0	59,10,000	9.0	902.0	169.0
39	11226.0	959.0	2209.406378	928.139319	1.0	NaN	57.0	0.0	cell_values	2103.0	620715000.0	39.0	0.0	2317.0	62,07,15,000	12.0	902.0	214.0
43	37051.0	1084.0	1229.191007	1053.613938	0.0	NaN	59.0	1.0	title	911.0	NaN	43.0	0.0	1549.0	REVENUE EXPENDITURE	19.0	1025.0	638.0
45	20972.0	1140.0	1230.401821	1113.837784	0.0	NaN	51.0	1.0	title	1019.0	NaN	45.0	0.0	1440.0	ABSTRACT ACCOUNT	16.0	1089.0	421.0
49	6212.0	1261.0	1675.519961	1229.796523	0.0	NaN	59.0	1.0	NaN	1613.0	NaN	49.0	0.0	1740.0	Budget	6.0	1202.0	127.0
50	6578.0	1253.0	1943.899970	1228.496047	0.0	NaN	51.0	1.0	NaN	1875.0	NaN	50.0	0.0	2015.0	Revised	7.0	1202.0	140.0
51	6254.0	1261.0	2225.048129	1229.807963	0.0	NaN	59.0	1.0	NaN	2162.0	NaN	51.0	0.0	2290.0	Budget	6.0	1202.0	128.0
52	15406.0	1365.0	1401.864858	1316.907763	0.0	52.0	107.0	1.0	cell_header	1313.0	NaN	52.0	1.0	1492.0	Actuals,\n2015-2016	18.0	1258.0	179.0
53	16239.0	1365.0	1675.118111	1314.721535	0.0	16.0	107.0	1.0	cell_header	1587.0	NaN	53.0	1.0	1766.0	Estimate,\n2016-2017	19.0	1258.0	179.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
103	15930.0	2290.0	1042.681293	2265.474451	0.0	NaN	51.0	1.0	cell_grouping	874.0	NaN	103.0	0.0	1218.0	Grand Total - Gross	19.0	2239.0	344.0
104	11140.0	2296.0	1384.866876	2265.031688	1.0	NaN	57.0	0.0	cell_values	1277.0	412064235.0	104.0	0.0	1492.0	41,20,64,235	12.0	2239.0	215.0
105	11232.0	2296.0	1659.334046	2265.156428	1.0	NaN	57.0	0.0	cell_values	1553.0	608951000.0	105.0	0.0	1767.0	60,89,51,000	12.0	2239.0	214.0
106	11284.0	2296.0	1933.903314	2265.162708	1.0	NaN	57.0	0.0	cell_values	1827.0	563709000.0	106.0	0.0	2042.0	56,37,09,000	12.0	2239.0	215.0
107	11232.0	2296.0	2209.386485	2265.143162	1.0	NaN	57.0	0.0	cell_values	2103.0	620976000.0	107.0	0.0	2317.0	62,09,76,000	12.0	2239.0	214.0
111	5085.0	2403.0	1163.480826	2377.670403	0.0	NaN	51.0	1.0	cell_grouping	1108.0	NaN	111.0	0.0	1218.0	Voted	5.0	2352.0	110.0
0	19416.0	2407.0	1395.550783	2402.243871	1.0	NaN	55.0	0.0	cell_values	1277.0	NaN	112.0	0.0	1493.0	41,04,44,905	22.0	2352.0	216.0
1	19416.0	2462.0	1395.550783	2402.243871	1.0	NaN	55.0	0.0	cell_values	1277.0	NaN	112.0	0.0	1493.0	16,19,330	22.0	2407.0	216.0
0	19639.0	2407.0	1669.992158	2402.199145	1.0	NaN	55.0	0.0	cell_values	1553.0	NaN	113.0	0.0	1768.0	60,34,24,000	22.0	2352.0	215.0
1	19639.0	2462.0	1669.992158	2402.199145	1.0	NaN	55.0	0.0	cell_values	1553.0	NaN	113.0	0.0	1768.0	55,27,000	22.0	2407.0	215.0
0	19683.0	2407.0	1944.925926	2402.218107	1.0	NaN	55.0	0.0	cell_values	1828.0	NaN	114.0	0.0	2043.0	55,81,28,000	22.0	2352.0	215.0
1	19683.0	2462.0	1944.925926	2402.218107	1.0	NaN	55.0	0.0	cell_values	1828.0	NaN	114.0	0.0	2043.0	55,81,000	22.0	2407.0	215.0
0	19667.0	2407.0	2220.060558	2402.283216	1.0	NaN	55.0	0.0	cell_values	2103.0	NaN	115.0	0.0	2318.0	61,50,66,000	22.0	2352.0	215.0
1	19667.0	2462.0	2220.060558	2402.283216	1.0	NaN	55.0	0.0	cell_values	2103.0	NaN	115.0	0.0	2318.0	59,10,000	22.0	2407.0	215.0
116	7042.0	2466.0	1143.765124	2435.896904	0.0	NaN	58.0	1.0	cell_grouping	1070.0	NaN	116.0	0.0	1219.0	Charged	7.0	2408.0	149.0
120	11604.0	2572.0	1090.725009	2546.832644	0.0	NaN	51.0	1.0	cell_grouping	969.0	NaN	120.0	0.0	1218.0	NP - Non Plan	13.0	2521.0	249.0
121	11140.0	2578.0	1384.866876	2547.031688	1.0	NaN	57.0	0.0	cell_values	1277.0	412064235.0	121.0	0.0	1492.0	41,20,64,235	12.0	2521.0	215.0
122	11232.0	2578.0	1659.334046	2547.156428	1.0	NaN	57.0	0.0	cell_values	1553.0	608951000.0	122.0	0.0	1767.0	60,89,51,000	12.0	2521.0	214.0
123	11284.0	2578.0	1933.903314	2547.162708	1.0	NaN	57.0	0.0	cell_values	1827.0	563709000.0	123.0	0.0	2042.0	56,37,09,000	12.0	2521.0	215.0
124	11232.0	2578.0	2209.386485	2547.143162	1.0	NaN	57.0	0.0	cell_values	2103.0	620976000.0	124.0	0.0	2317.0	62,09,76,000	12.0	2521.0	214.0
128	5085.0	2684.0	1163.480826	2658.670403	0.0	NaN	51.0	1.0	cell_grouping	1108.0	NaN	128.0	0.0	1218.0	Voted	5.0	2633.0	110.0
0	19416.0	2688.0	1395.550783	2683.243871	1.0	NaN	55.0	0.0	cell_values	1277.0	NaN	129.0	0.0	1493.0	41,04,44,905	22.0	2633.0	216.0
1	19416.0	2743.0	1395.550783	2683.243871	1.0	NaN	55.0	0.0	cell_values	1277.0	NaN	129.0	0.0	1493.0	16,19,330	22.0	2688.0	216.0
0	19639.0	2688.0	1669.992158	2683.199145	1.0	NaN	55.0	0.0	cell_values	1553.0	NaN	130.0	0.0	1768.0	60,34,24,000	22.0	2633.0	215.0
1	19639.0	2743.0	1669.992158	2683.199145	1.0	NaN	55.0	0.0	cell_values	1553.0	NaN	130.0	0.0	1768.0	55,27,000	22.0	2688.0	215.0
0	19683.0	2688.0	1944.925926	2683.218107	1.0	NaN	55.0	0.0	cell_values	1828.0	NaN	131.0	0.0	2043.0	55,81,28,000	22.0	2633.0	215.0
1	19683.0	2743.0	1944.925926	2683.218107	1.0	NaN	55.0	0.0	cell_values	1828.0	NaN	131.0	0.0	2043.0	55,81,000	22.0	2688.0	215.0
0	19667.0	2688.0	2220.060558	2683.283216	1.0	NaN	55.0	0.0	cell_values	2103.0	NaN	132.0	0.0	2318.0	61,50,66,000	22.0	2633.0	215.0
1	19667.0	2743.0	2220.060558	2683.283216	1.0	NaN	55.0	0.0	cell_values	2103.0	NaN	132.0	0.0	2318.0	59,10,000	22.0	2688.0	215.0
133	7042.0	2747.0	1143.765124	2716.896904	0.0	NaN	58.0	1.0	cell_grouping	1070.0	NaN	133.0	0.0	1219.0	Charged	7.0	2689.0	149.0