In [122]:
import numpy as np
# Edge-detection settings read by find_line.
threshold = 200     # a line counts as dark when the average of its darkest values is <= this (0 is darkest, 255 is lightest)
obviousness = 50    # how many of the darkest pixels per line go into that average (1 would mean a single dark pixel triggers it)

from PIL import Image
from pytesseract import image_to_string
def find_line(vals, max_average=None, sample_size=None):
    """Return the index of the first row of `vals` that is dark enough to be an edge.

    A row counts as dark when the mean of its `sample_size` smallest values
    is at or below `max_average`. The edge detection is implemented once
    here and reused for every side of the image by passing flipped or
    transposed data.

    Parameters
    ----------
    vals : 2-D numpy array or sequence of rows of numbers
        Rows are scanned in order and are NOT modified.
    max_average : number, optional
        Darkness cutoff; defaults to the module-level `threshold`.
    sample_size : int, optional
        How many of the darkest values of each row are averaged; defaults to
        the module-level `obviousness`.

    Returns
    -------
    int
        Index of the first qualifying row. When no row qualifies, the index
        of the last row is returned (the bound of the search); 0 when `vals`
        has no rows at all.
    """
    if max_average is None:
        max_average = threshold
    if sample_size is None:
        sample_size = obviousness

    index = 0  # keeps the fall-through return defined for empty input
    for index, row in enumerate(vals):
        # np.sort returns a sorted copy, so the caller's data is left untouched;
        # float conversion keeps the mean safe from small-integer overflow.
        darkest = np.sort(np.asarray(row, dtype=float))[:sample_size]
        if darkest.size and darkest.mean() <= max_average:
            return index
    return index    # no row was dark enough: report the last index searched

def getbox(img):
    """Get the bounding box of the interesting part of a PIL image object.

    Every pixel is reduced to the darkest of its band values (R, G or B for
    an RGB image); the box edges are the first row/column, scanning inwards
    from each side, that find_line reports as dark/coloured enough.

    Parameters
    ----------
    img : PIL image
        Multi-band images are used as they are; single-band images are
        expanded to RGB first so that a per-pixel band minimum exists.

    Returns
    -------
    tuple or None
        (left, upper, right, lower), or None (after printing an error) when
        the edges found do not describe a box with positive width and height.
    """
    width, height = img.size

    # With a single band, getdata() yields bare ints and there is nothing to
    # take a per-pixel minimum over, so expand such images to RGB.
    if len(img.getbands()) == 1:
        img = img.convert('RGB')

    # (height, width, bands) -> (height, width): darkest band of each pixel.
    # Cast to plain ints so summing in find_line cannot wrap 8-bit values.
    vals = np.asarray(img).min(axis=2).astype(int)

    # Each scan gets its own copy because find_line is allowed to sort the
    # rows it is given in place.
    upper = find_line(vals.copy())
    # lower edge: scan the rows bottom-up
    lower = height - find_line(vals[::-1].copy())

    # left/right edges: same as above, but on the transposed data so that
    # image columns become rows
    columns = vals.T
    left = find_line(columns.copy())
    right = width - find_line(columns[::-1].copy())

    if left >= right or upper >= lower:
        print("error, bounding box is not legit")
        return None
    return (left, upper, right, lower)

def crop_image(image_name):
    """Crop an image file to its dark content and save the result next to it.

    Parameters
    ----------
    image_name : str
        Path of the image to open.

    Returns
    -------
    tuple
        (cropped_image_name, ratio): the path the cropped image was saved to
        (the original file name prefixed with 'cropped_', in the same
        directory) and the width/height ratio of the crop box.

    Raises
    ------
    ValueError
        If getbox finds no valid bounding box; nothing is written in that case.
    """
    import os.path  # local import: only needed for building the output path

    image = Image.open(image_name)
    box = getbox(image)
    if box is None:
        # Without this check an uncropped copy would be saved and the ratio
        # computation below would fail on box[2].
        raise ValueError("no valid bounding box found in %r" % (image_name,))

    # Prefix the file name only, so 'dir/pan.jpg' becomes 'dir/cropped_pan.jpg'
    # rather than 'cropped_dir/pan.jpg'.
    directory, filename = os.path.split(image_name)
    cropped_image_name = os.path.join(directory, 'cropped_' + filename)
    image.crop(box).save(cropped_image_name)

    # box is (left, upper, right, lower); getbox guarantees lower > upper
    ratio = (box[2] - box[0]) / float(box[3] - box[1])
    return (cropped_image_name, ratio)

In [123]:
# Crop the scan to its content; keep the saved file name and the crop's width/height ratio.
image_to_process, ratio = crop_image('pan2.jpg')

In [124]:
# Width/height ratio of the crop box; a value below 1 means the crop is taller than wide.
ratio


Out[124]:
1.6221374045801527

In [125]:
def _rotate_saved_image(path, degrees):
    """Rotate the image file at `path` by `degrees`, overwrite it, and return it reopened."""
    Image.open(path).rotate(degrees, expand=True).save(path)
    return Image.open(path)


# Always open the cropped image first: previously `t` was only assigned in
# the portrait branch, so a landscape image failed with NameError on a fresh
# kernel.
t = Image.open(image_to_process)

if ratio < 1:
    # taller than wide: turn it by a quarter so the crop becomes landscape
    t = _rotate_saved_image(image_to_process, 90)

text_from_image = image_to_string(t, lang='eng')

# "INCOME" is used as the marker of a readable scan (presumably it is part of
# the card's header line); if OCR did not find it, assume the image is upside
# down, turn it around and read it again.
if "INCOME" not in text_from_image:
    t = _rotate_saved_image(image_to_process, 180)
    text_from_image = image_to_string(t, lang='eng')

In [126]:
# Split the OCR text on newlines; despite the name, each entry is a whole line of text.
list_of_words = text_from_image.split('\n')

In [128]:
# Show the raw OCR lines, including the empty ones.
list_of_words


Out[128]:
[u'g ,',
 u'W Forum',
 u'',
 u'INCOMETAXDEPARTMENT',
 u'',
 u'DMANIKANDAN',
 u'',
 u'DURAISAMY',
 u'',
 u'1 6/07/1 986',
 u'Permanent Accounl Number',
 u'',
 u'BNZPM2501 F',
 u'',
 u'@- \u201cMN? \u201c\u201863:"?',
 u'',
 u'Signature']

In [118]:
# Keep only the OCR lines that still have more than one character once the
# surrounding whitespace is removed; this drops blank lines and lone characters.
cleaned_words = [
    candidate
    for candidate in (line.strip() for line in list_of_words)
    if len(candidate) > 1
]

In [119]:
# Find the first cleaned line containing "INCOME". The loop has no body besides
# the break: its result is the leftover loop variables, with `i` the index of
# that line and `word` the line itself.
# NOTE(review): if no line matches, `i` ends up as the index of the last line,
# and if cleaned_words is empty neither variable is (re)assigned here.
for i, word in enumerate(cleaned_words):
    if "INCOME" in word:
        break

In [120]:
# Locate the header line containing "INCOME" and read the fields relative to
# it, instead of using absolute positions that shift whenever OCR emits more
# or fewer noise lines above the header.
header_index = next(
    (index for index, line in enumerate(cleaned_words) if "INCOME" in line),
    None,
)
if header_index is None:
    raise ValueError('no line containing "INCOME" found in the OCR output')
if len(cleaned_words) <= header_index + 5:
    raise ValueError('OCR output is too short to contain all card fields')

# Layout after the header: name, father's name, date of birth, the
# "Permanent Account Number" label, then the number itself.
user_name = cleaned_words[header_index + 1]
fathers_name = cleaned_words[header_index + 2]
# OCR tends to insert stray spaces inside the date and the number; remove them
dob = cleaned_words[header_index + 3].replace(' ', '')

pan_card_no = cleaned_words[header_index + 5].replace(' ', '')

In [121]:
# Display the four extracted fields together for a visual check.
user_name, fathers_name, dob, pan_card_no


Out[121]:
(u'DMANIKANDAN', u'DURAISAMY', u'16/07/1986', u'BNZPM2501F')

In [80]:
# Scratch check: the cell that assigns dob already removes its spaces, so this shows the same value.
dob.replace(' ', '')


Out[80]:
u'16/07/1986'

In [ ]: